Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 1
-rw-r--r--  fs/9p/fid.c | 13
-rw-r--r--  fs/9p/v9fs.c | 32
-rw-r--r--  fs/9p/v9fs.h | 3
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_dentry.c | 1
-rw-r--r--  fs/9p/vfs_dir.c | 11
-rw-r--r--  fs/9p/vfs_file.c | 17
-rw-r--r--  fs/9p/vfs_inode.c | 121
-rw-r--r--  fs/9p/vfs_super.c | 60
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/adfs/dir.c | 2
-rw-r--r--  fs/adfs/file.c | 2
-rw-r--r--  fs/adfs/inode.c | 3
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/bitmap.c | 1
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/namei.c | 2
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/cache.c | 1
-rw-r--r--  fs/afs/cmservice.c | 1
-rw-r--r--  fs/afs/dir.c | 7
-rw-r--r--  fs/afs/file.c | 66
-rw-r--r--  fs/afs/fsclient.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/internal.h | 5
-rw-r--r--  fs/afs/mntpt.c | 32
-rw-r--r--  fs/afs/rxrpc.c | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vlclient.c | 1
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/afs/vnode.c | 1
-rw-r--r--  fs/afs/volume.c | 7
-rw-r--r--  fs/afs/write.c | 3
-rw-r--r--  fs/aio.c | 71
-rw-r--r--  fs/anon_inodes.c | 1
-rw-r--r--  fs/attr.c | 50
-rw-r--r--  fs/autofs/root.c | 2
-rw-r--r--  fs/autofs4/dev-ioctl.c | 19
-rw-r--r--  fs/autofs4/root.c | 28
-rw-r--r--  fs/bad_inode.c | 3
-rw-r--r--  fs/befs/datastream.c | 1
-rw-r--r--  fs/bfs/dir.c | 6
-rw-r--r--  fs/binfmt_aout.c | 16
-rw-r--r--  fs/binfmt_elf_fdpic.c | 9
-rw-r--r--  fs/binfmt_em86.c | 1
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_script.c | 1
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 4
-rw-r--r--  fs/block_dev.c | 366
-rw-r--r--  fs/btrfs/acl.c | 5
-rw-r--r--  fs/btrfs/async-thread.c | 2
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/compression.c | 23
-rw-r--r--  fs/btrfs/ctree.c | 114
-rw-r--r--  fs/btrfs/ctree.h | 167
-rw-r--r--  fs/btrfs/delayed-ref.c | 102
-rw-r--r--  fs/btrfs/delayed-ref.h | 3
-rw-r--r--  fs/btrfs/disk-io.c | 186
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2263
-rw-r--r--  fs/btrfs/extent_io.c | 101
-rw-r--r--  fs/btrfs/extent_io.h | 14
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/file-item.c | 29
-rw-r--r--  fs/btrfs/file.c | 170
-rw-r--r--  fs/btrfs/free-space-cache.c | 1
-rw-r--r--  fs/btrfs/inode-item.c | 27
-rw-r--r--  fs/btrfs/inode.c | 1780
-rw-r--r--  fs/btrfs/ioctl.c | 219
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 87
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/ref-cache.c | 1
-rw-r--r--  fs/btrfs/relocation.c | 1972
-rw-r--r--  fs/btrfs/root-tree.c | 23
-rw-r--r--  fs/btrfs/super.c | 59
-rw-r--r--  fs/btrfs/transaction.c | 315
-rw-r--r--  fs/btrfs/transaction.h | 24
-rw-r--r--  fs/btrfs/tree-defrag.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 242
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 34
-rw-r--r--  fs/btrfs/xattr.c | 14
-rw-r--r--  fs/btrfs/xattr.h | 6
-rw-r--r--  fs/buffer.c | 149
-rw-r--r--  fs/cachefiles/interface.c | 1
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 99
-rw-r--r--  fs/cachefiles/rdwr.c | 1
-rw-r--r--  fs/cachefiles/security.c | 4
-rw-r--r--  fs/cachefiles/xattr.c | 1
-rw-r--r--  fs/ceph/addr.c | 92
-rw-r--r--  fs/ceph/auth.c | 8
-rw-r--r--  fs/ceph/auth.h | 8
-rw-r--r--  fs/ceph/auth_none.c | 10
-rw-r--r--  fs/ceph/auth_none.h | 2
-rw-r--r--  fs/ceph/auth_x.c | 117
-rw-r--r--  fs/ceph/buffer.c | 3
-rw-r--r--  fs/ceph/caps.c | 165
-rw-r--r--  fs/ceph/ceph_fs.h | 83
-rw-r--r--  fs/ceph/ceph_strings.c | 16
-rw-r--r--  fs/ceph/crypto.c | 1
-rw-r--r--  fs/ceph/debugfs.c | 14
-rw-r--r--  fs/ceph/dir.c | 69
-rw-r--r--  fs/ceph/export.c | 17
-rw-r--r--  fs/ceph/file.c | 25
-rw-r--r--  fs/ceph/inode.c | 131
-rw-r--r--  fs/ceph/ioctl.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 470
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/messenger.c | 165
-rw-r--r--  fs/ceph/messenger.h | 13
-rw-r--r--  fs/ceph/mon_client.c | 263
-rw-r--r--  fs/ceph/mon_client.h | 27
-rw-r--r--  fs/ceph/msgpool.c | 180
-rw-r--r--  fs/ceph/msgpool.h | 12
-rw-r--r--  fs/ceph/msgr.h | 21
-rw-r--r--  fs/ceph/osd_client.c | 160
-rw-r--r--  fs/ceph/osd_client.h | 5
-rw-r--r--  fs/ceph/osdmap.c | 232
-rw-r--r--  fs/ceph/osdmap.h | 3
-rw-r--r--  fs/ceph/pagelist.c | 3
-rw-r--r--  fs/ceph/rados.h | 30
-rw-r--r--  fs/ceph/snap.c | 59
-rw-r--r--  fs/ceph/super.c | 171
-rw-r--r--  fs/ceph/super.h | 36
-rw-r--r--  fs/ceph/xattr.c | 36
-rw-r--r--  fs/cifs/asn1.c | 103
-rw-r--r--  fs/cifs/cifs_debug.c | 48
-rw-r--r--  fs/cifs/cifs_debug.h | 42
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 35
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 3
-rw-r--r--  fs/cifs/cifs_spnego.c | 7
-rw-r--r--  fs/cifs/cifs_unicode.c | 6
-rw-r--r--  fs/cifs/cifsacl.c | 77
-rw-r--r--  fs/cifs/cifsencrypt.c | 11
-rw-r--r--  fs/cifs/cifsfs.c | 174
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 11
-rw-r--r--  fs/cifs/cifsproto.h | 30
-rw-r--r--  fs/cifs/cifssmb.c | 472
-rw-r--r--  fs/cifs/connect.c | 640
-rw-r--r--  fs/cifs/dir.c | 91
-rw-r--r--  fs/cifs/dns_resolve.c | 17
-rw-r--r--  fs/cifs/export.c | 2
-rw-r--r--  fs/cifs/file.c | 249
-rw-r--r--  fs/cifs/inode.c | 145
-rw-r--r--  fs/cifs/ioctl.c | 10
-rw-r--r--  fs/cifs/link.c | 11
-rw-r--r--  fs/cifs/misc.c | 81
-rw-r--r--  fs/cifs/netmisc.c | 16
-rw-r--r--  fs/cifs/readdir.c | 86
-rw-r--r--  fs/cifs/sess.c | 82
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/transport.c | 93
-rw-r--r--  fs/cifs/xattr.c | 41
-rw-r--r--  fs/coda/coda_int.h | 3
-rw-r--r--  fs/coda/dir.c | 1
-rw-r--r--  fs/coda/file.c | 7
-rw-r--r--  fs/coda/inode.c | 9
-rw-r--r--  fs/coda/pioctl.c | 76
-rw-r--r--  fs/coda/psdev.c | 5
-rw-r--r--  fs/coda/upcall.c | 1
-rw-r--r--  fs/compat.c | 135
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/configfs/dir.c | 4
-rw-r--r--  fs/configfs/inode.c | 10
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/configfs/symlink.c | 1
-rw-r--r--  fs/dcache.c | 20
-rw-r--r--  fs/debugfs/file.c | 21
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 10
-rw-r--r--  fs/direct-io.c | 123
-rw-r--r--  fs/dlm/config.c | 1
-rw-r--r--  fs/dlm/debug_fs.c | 1
-rw-r--r--  fs/dlm/lock.c | 6
-rw-r--r--  fs/dlm/lowcomms.c | 1
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 1
-rw-r--r--  fs/dlm/user.c | 89
-rw-r--r--  fs/drop_caches.c | 24
-rw-r--r--  fs/ecryptfs/crypto.c | 38
-rw-r--r--  fs/ecryptfs/dentry.c | 1
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 20
-rw-r--r--  fs/ecryptfs/file.c | 7
-rw-r--r--  fs/ecryptfs/inode.c | 182
-rw-r--r--  fs/ecryptfs/keystore.c | 1
-rw-r--r--  fs/ecryptfs/kthread.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 159
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 58
-rw-r--r--  fs/ecryptfs/read_write.c | 13
-rw-r--r--  fs/ecryptfs/super.c | 23
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 204
-rw-r--r--  fs/exofs/dir.c | 2
-rw-r--r--  fs/exofs/exofs.h | 2
-rw-r--r--  fs/exofs/file.c | 7
-rw-r--r--  fs/exofs/inode.c | 42
-rw-r--r--  fs/exofs/ios.c | 1
-rw-r--r--  fs/exofs/super.c | 9
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/balloc.c | 7
-rw-r--r--  fs/ext2/ext2.h | 3
-rw-r--r--  fs/ext2/file.c | 7
-rw-r--r--  fs/ext2/ialloc.c | 21
-rw-r--r--  fs/ext2/inode.c | 160
-rw-r--r--  fs/ext2/super.c | 117
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext2/xattr.h | 12
-rw-r--r--  fs/ext2/xattr_security.c | 3
-rw-r--r--  fs/ext2/xattr_trusted.c | 2
-rw-r--r--  fs/ext2/xattr_user.c | 2
-rw-r--r--  fs/ext3/acl.c | 4
-rw-r--r--  fs/ext3/balloc.c | 7
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/fsync.c | 27
-rw-r--r--  fs/ext3/ialloc.c | 17
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/super.c | 115
-rw-r--r--  fs/ext3/symlink.c | 2
-rw-r--r--  fs/ext3/xattr.c | 10
-rw-r--r--  fs/ext3/xattr.h | 12
-rw-r--r--  fs/ext3/xattr_security.c | 3
-rw-r--r--  fs/ext3/xattr_trusted.c | 2
-rw-r--r--  fs/ext3/xattr_user.c | 2
-rw-r--r--  fs/ext4/acl.c | 4
-rw-r--r--  fs/ext4/balloc.c | 5
-rw-r--r--  fs/ext4/block_validity.c | 5
-rw-r--r--  fs/ext4/dir.c | 26
-rw-r--r--  fs/ext4/ext4.h | 169
-rw-r--r--  fs/ext4/ext4_jbd2.h | 8
-rw-r--r--  fs/ext4/extents.c | 418
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/fsync.c | 47
-rw-r--r--  fs/ext4/ialloc.c | 103
-rw-r--r--  fs/ext4/inode.c | 733
-rw-r--r--  fs/ext4/ioctl.c | 27
-rw-r--r--  fs/ext4/mballoc.c | 140
-rw-r--r--  fs/ext4/migrate.c | 3
-rw-r--r--  fs/ext4/move_extent.c | 14
-rw-r--r--  fs/ext4/namei.c | 61
-rw-r--r--  fs/ext4/resize.c | 3
-rw-r--r--  fs/ext4/super.c | 146
-rw-r--r--  fs/ext4/symlink.c | 2
-rw-r--r--  fs/ext4/xattr.c | 49
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/ext4/xattr_security.c | 3
-rw-r--r--  fs/ext4/xattr_trusted.c | 2
-rw-r--r--  fs/ext4/xattr_user.c | 2
-rw-r--r--  fs/fat/cache.c | 14
-rw-r--r--  fs/fat/dir.c | 28
-rw-r--r--  fs/fat/fat.h | 22
-rw-r--r--  fs/fat/file.c | 59
-rw-r--r--  fs/fat/inode.c | 43
-rw-r--r--  fs/fat/misc.c | 22
-rw-r--r--  fs/fat/namei_vfat.c | 6
-rw-r--r--  fs/fcntl.c | 71
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/file_table.c | 21
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 2
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 1
-rw-r--r--  fs/fs-writeback.c | 242
-rw-r--r--  fs/fscache/object-list.c | 3
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 5
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fscache/stats.c | 4
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 528
-rw-r--r--  fs/fuse/dir.c | 5
-rw-r--r--  fs/fuse/file.c | 48
-rw-r--r--  fs/fuse/fuse_i.h | 6
-rw-r--r--  fs/generic_acl.c | 5
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 16
-rw-r--r--  fs/gfs2/bmap.c | 18
-rw-r--r--  fs/gfs2/dentry.c | 1
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 3
-rw-r--r--  fs/gfs2/file.c | 11
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/incore.h | 11
-rw-r--r--  fs/gfs2/inode.c | 103
-rw-r--r--  fs/gfs2/inode.h | 4
-rw-r--r--  fs/gfs2/lock_dlm.c | 1
-rw-r--r--  fs/gfs2/log.c | 160
-rw-r--r--  fs/gfs2/log.h | 30
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 19
-rw-r--r--  fs/gfs2/ops_inode.c | 5
-rw-r--r--  fs/gfs2/quota.c | 114
-rw-r--r--  fs/gfs2/rgrp.c | 81
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/super.h | 2
-rw-r--r--  fs/gfs2/sys.c | 7
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/gfs2/xattr.c | 6
-rw-r--r--  fs/hfs/bnode.c | 1
-rw-r--r--  fs/hfs/btree.c | 1
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/ioctl.c | 12
-rw-r--r--  fs/hfsplus/options.c | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hpfs/buffer.c | 1
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/file.c | 4
-rw-r--r--  fs/hpfs/hpfs_fn.h | 2
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/super.c | 1
-rw-r--r--  fs/hppfs/hppfs.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 28
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/ioctl.c | 107
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/isofs/dir.c | 2
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/commit.c | 9
-rw-r--r--  fs/jbd/journal.c | 33
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jbd2/transaction.c | 5
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 4
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/compr_lzo.c | 1
-rw-r--r--  fs/jffs2/compr_zlib.c | 1
-rw-r--r--  fs/jffs2/debug.c | 1
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/file.c | 5
-rw-r--r--  fs/jffs2/fs.c | 14
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.c | 1
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 29
-rw-r--r--  fs/jffs2/os-linux.h | 5
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/security.c | 2
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/write.c | 1
-rw-r--r--  fs/jffs2/xattr.c | 8
-rw-r--r--  fs/jffs2/xattr.h | 8
-rw-r--r--  fs/jffs2/xattr_trusted.c | 2
-rw-r--r--  fs/jffs2/xattr_user.c | 2
-rw-r--r--  fs/jfs/acl.c | 1
-rw-r--r--  fs/jfs/file.c | 6
-rw-r--r--  fs/jfs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 19
-rw-r--r--  fs/jfs/jfs_dmap.h | 6
-rw-r--r--  fs/jfs/jfs_dtree.c | 1
-rw-r--r--  fs/jfs/jfs_imap.c | 1
-rw-r--r--  fs/jfs/jfs_inode.c | 12
-rw-r--r--  fs/jfs/jfs_inode.h | 3
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_unicode.h | 1
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 6
-rw-r--r--  fs/jfs/super.c | 30
-rw-r--r--  fs/jfs/symlink.c | 14
-rw-r--r--  fs/jfs/xattr.c | 1
-rw-r--r--  fs/libfs.c | 144
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 1
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 1
-rw-r--r--  fs/logfs/dev_bdev.c | 16
-rw-r--r--  fs/logfs/dev_mtd.c | 26
-rw-r--r--  fs/logfs/dir.c | 8
-rw-r--r--  fs/logfs/file.c | 20
-rw-r--r--  fs/logfs/gc.c | 58
-rw-r--r--  fs/logfs/inode.c | 16
-rw-r--r--  fs/logfs/journal.c | 44
-rw-r--r--  fs/logfs/logfs.h | 33
-rw-r--r--  fs/logfs/logfs_abi.h | 10
-rw-r--r--  fs/logfs/readwrite.c | 106
-rw-r--r--  fs/logfs/segment.c | 70
-rw-r--r--  fs/logfs/super.c | 45
-rw-r--r--  fs/minix/bitmap.c | 5
-rw-r--r--  fs/minix/dir.c | 7
-rw-r--r--  fs/minix/file.c | 2
-rw-r--r--  fs/minix/itree_v1.c | 1
-rw-r--r--  fs/minix/itree_v2.c | 27
-rw-r--r--  fs/minix/minix.h | 2
-rw-r--r--  fs/minix/namei.c | 11
-rw-r--r--  fs/mpage.c | 1
-rw-r--r--  fs/namei.c | 50
-rw-r--r--  fs/namespace.c | 19
-rw-r--r--  fs/ncpfs/dir.c | 4
-rw-r--r--  fs/ncpfs/file.c | 5
-rw-r--r--  fs/ncpfs/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 28
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/ncpfs/symlink.c | 1
-rw-r--r--  fs/nfs/cache_lib.c | 1
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 1
-rw-r--r--  fs/nfs/client.c | 61
-rw-r--r--  fs/nfs/delegation.c | 89
-rw-r--r--  fs/nfs/dir.c | 154
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/dns_resolve.c | 1
-rw-r--r--  fs/nfs/file.c | 25
-rw-r--r--  fs/nfs/fscache.c | 4
-rw-r--r--  fs/nfs/getroot.c | 191
-rw-r--r--  fs/nfs/inode.c | 67
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 6
-rw-r--r--  fs/nfs/namespace.c | 21
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3acl.c | 24
-rw-r--r--  fs/nfs/nfs3proc.c | 129
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4namespace.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 185
-rw-r--r--  fs/nfs/nfs4state.c | 36
-rw-r--r--  fs/nfs/nfs4xdr.c | 25
-rw-r--r--  fs/nfs/nfsroot.c | 14
-rw-r--r--  fs/nfs/pagelist.c | 14
-rw-r--r--  fs/nfs/proc.c | 145
-rw-r--r--  fs/nfs/read.c | 4
-rw-r--r--  fs/nfs/super.c | 155
-rw-r--r--  fs/nfs/symlink.c | 1
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 107
-rw-r--r--  fs/nfs_common/nfsacl.c | 1
-rw-r--r--  fs/nfsd/export.c | 45
-rw-r--r--  fs/nfsd/nfs2acl.c | 1
-rw-r--r--  fs/nfsd/nfs3acl.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 141
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 51
-rw-r--r--  fs/nfsd/nfs4recover.c | 88
-rw-r--r--  fs/nfsd/nfs4state.c | 377
-rw-r--r--  fs/nfsd/nfs4xdr.c | 36
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 69
-rw-r--r--  fs/nfsd/nfsd.h | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/state.h | 47
-rw-r--r--  fs/nfsd/vfs.c | 14
-rw-r--r--  fs/nfsd/vfs.h | 1
-rw-r--r--  fs/nfsd/xdr4.h | 11
-rw-r--r--  fs/nilfs2/alloc.c | 157
-rw-r--r--  fs/nilfs2/alloc.h | 7
-rw-r--r--  fs/nilfs2/btnode.c | 1
-rw-r--r--  fs/nilfs2/btree.c | 93
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/file.c | 4
-rw-r--r--  fs/nilfs2/gcinode.c | 1
-rw-r--r--  fs/nilfs2/inode.c | 16
-rw-r--r--  fs/nilfs2/ioctl.c | 3
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/page.c | 1
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 79
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 171
-rw-r--r--  fs/nilfs2/segment.h | 6
-rw-r--r--  fs/nilfs2/super.c | 219
-rw-r--r--  fs/nilfs2/the_nilfs.c | 14
-rw-r--r--  fs/nilfs2/the_nilfs.h | 1
-rw-r--r--  fs/notify/fsnotify.c | 1
-rw-r--r--  fs/notify/inode_mark.c | 1
-rw-r--r--  fs/notify/inotify/Kconfig | 1
-rw-r--r--  fs/notify/inotify/inotify.c | 88
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 16
-rw-r--r--  fs/ntfs/aops.c | 1
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 1
-rw-r--r--  fs/ntfs/dir.c | 6
-rw-r--r--  fs/ntfs/file.c | 38
-rw-r--r--  fs/ntfs/index.c | 2
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 82
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/blockcheck.c | 4
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 1
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 34
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 17
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/extent_map.c | 1
-rw-r--r--  fs/ocfs2/file.c | 283
-rw-r--r--  fs/ocfs2/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/inode.c | 129
-rw-r--r--  fs/ocfs2/inode.h | 4
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 285
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/mmap.c | 49
-rw-r--r--  fs/ocfs2/namei.c | 184
-rw-r--r--  fs/ocfs2/ocfs2.h | 36
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota.h | 12
-rw-r--r--  fs/ocfs2/quota_global.c | 352
-rw-r--r--  fs/ocfs2/quota_local.c | 184
-rw-r--r--  fs/ocfs2/refcounttree.c | 79
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 1
-rw-r--r--  fs/ocfs2/stack_user.c | 1
-rw-r--r--  fs/ocfs2/suballoc.c | 817
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 142
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/sysfile.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 127
-rw-r--r--  fs/ocfs2/xattr.h | 12
-rw-r--r--  fs/omfs/file.c | 2
-rw-r--r--  fs/omfs/inode.c | 6
-rw-r--r--  fs/open.c | 168
-rw-r--r--  fs/partitions/acorn.c | 68
-rw-r--r--  fs/partitions/acorn.h | 10
-rw-r--r--  fs/partitions/amiga.c | 13
-rw-r--r--  fs/partitions/amiga.h | 2
-rw-r--r--  fs/partitions/atari.c | 8
-rw-r--r--  fs/partitions/atari.h | 2
-rw-r--r--  fs/partitions/check.c | 85
-rw-r--r--  fs/partitions/check.h | 12
-rw-r--r--  fs/partitions/efi.c | 94
-rw-r--r--  fs/partitions/efi.h | 2
-rw-r--r--  fs/partitions/ibm.c | 21
-rw-r--r--  fs/partitions/ibm.h | 2
-rw-r--r--  fs/partitions/karma.c | 4
-rw-r--r--  fs/partitions/karma.h | 2
-rw-r--r--  fs/partitions/ldm.c | 107
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/partitions/mac.c | 13
-rw-r--r--  fs/partitions/mac.h | 2
-rw-r--r--  fs/partitions/msdos.c | 146
-rw-r--r--  fs/partitions/msdos.h | 2
-rw-r--r--  fs/partitions/osf.c | 4
-rw-r--r--  fs/partitions/osf.h | 2
-rw-r--r--  fs/partitions/sgi.c | 6
-rw-r--r--  fs/partitions/sgi.h | 2
-rw-r--r--  fs/partitions/sun.c | 6
-rw-r--r--  fs/partitions/sun.h | 2
-rw-r--r--  fs/partitions/sysv68.c | 6
-rw-r--r--  fs/partitions/sysv68.h | 2
-rw-r--r--  fs/partitions/ultrix.c | 4
-rw-r--r--  fs/partitions/ultrix.h | 2
-rw-r--r--  fs/pipe.c | 133
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/proc/base.c | 34
-rw-r--r--  fs/proc/generic.c | 16
-rw-r--r--  fs/proc/inode.c | 5
-rw-r--r--  fs/proc/kcore.c | 6
-rw-r--r--  fs/proc/kmsg.c | 1
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 1
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/stat.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 138
-rw-r--r--  fs/proc/task_nommu.c | 1
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/dir.c | 3
-rw-r--r--  fs/quota/Kconfig | 8
-rw-r--r--  fs/quota/dquot.c | 436
-rw-r--r--  fs/quota/netlink.c | 1
-rw-r--r--  fs/quota/quota.c | 99
-rw-r--r--  fs/quota/quota_tree.c | 50
-rw-r--r--  fs/quota/quota_tree.h | 6
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 6
-rw-r--r--  fs/ramfs/file-mmu.c | 3
-rw-r--r--  fs/ramfs/file-nommu.c | 10
-rw-r--r--  fs/ramfs/inode.c | 23
-rw-r--r--  fs/read_write.c | 19
-rw-r--r--  fs/reiserfs/dir.c | 12
-rw-r--r--  fs/reiserfs/file.c | 8
-rw-r--r--  fs/reiserfs/fix_node.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 4
-rw-r--r--  fs/reiserfs/journal.c | 16
-rw-r--r--  fs/reiserfs/namei.c | 19
-rw-r--r--  fs/reiserfs/super.c | 59
-rw-r--r--  fs/reiserfs/xattr.c | 36
-rw-r--r--  fs/reiserfs/xattr_acl.c | 5
-rw-r--r--  fs/reiserfs/xattr_security.c | 5
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 2
-rw-r--r--  fs/reiserfs/xattr_user.c | 2
-rw-r--r--  fs/signalfd.c | 1
-rw-r--r--  fs/smbfs/dir.c | 3
-rw-r--r--  fs/smbfs/file.c | 6
-rw-r--r--  fs/smbfs/inode.c | 10
-rw-r--r--  fs/smbfs/ioctl.c | 10
-rw-r--r--  fs/smbfs/proto.h | 2
-rw-r--r--  fs/smbfs/smbiod.c | 1
-rw-r--r--  fs/splice.c | 152
-rw-r--r--  fs/squashfs/Kconfig | 11
-rw-r--r--  fs/squashfs/Makefile | 2
-rw-r--r--  fs/squashfs/block.c | 5
-rw-r--r--  fs/squashfs/inode.c | 92
-rw-r--r--  fs/squashfs/namei.c | 6
-rw-r--r--  fs/squashfs/squashfs.h | 12
-rw-r--r--  fs/squashfs/squashfs_fs.h | 76
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 3
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 3
-rw-r--r--  fs/squashfs/super.c | 34
-rw-r--r--  fs/squashfs/symlink.c | 12
-rw-r--r--  fs/squashfs/xattr.c | 323
-rw-r--r--  fs/squashfs/xattr.h | 46
-rw-r--r--  fs/squashfs/xattr_id.c | 100
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 4
-rw-r--r--  fs/statfs.c | 196
-rw-r--r--  fs/super.c | 343
-rw-r--r--  fs/sync.c | 98
-rw-r--r--  fs/sysfs/bin.c | 26
-rw-r--r--  fs/sysfs/dir.c | 114
-rw-r--r--  fs/sysfs/file.c | 17
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 15
-rw-r--r--  fs/sysfs/mount.c | 96
-rw-r--r--  fs/sysfs/symlink.c | 37
-rw-r--r--  fs/sysfs/sysfs.h | 34
-rw-r--r--  fs/sysv/dir.c | 4
-rw-r--r--  fs/sysv/file.c | 2
-rw-r--r--  fs/sysv/ialloc.c | 11
-rw-r--r--  fs/sysv/inode.c | 1
-rw-r--r--  fs/timerfd.c | 26
-rw-r--r--  fs/ubifs/commit.c | 1
-rw-r--r--  fs/ubifs/debug.c | 1
-rw-r--r--  fs/ubifs/dir.c | 9
-rw-r--r--  fs/ubifs/file.c | 18
-rw-r--r--  fs/ubifs/gc.c | 1
-rw-r--r--  fs/ubifs/io.c | 2
-rw-r--r--  fs/ubifs/lpt.c | 1
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 1
-rw-r--r--  fs/ubifs/sb.c | 1
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 5
-rw-r--r--  fs/ubifs/xattr.c | 1
-rw-r--r--  fs/udf/balloc.c | 53
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/file.c | 71
-rw-r--r--  fs/udf/ialloc.c | 32
-rw-r--r--  fs/udf/inode.c | 7
-rw-r--r--  fs/udf/namei.c | 35
-rw-r--r--  fs/udf/partition.c | 1
-rw-r--r--  fs/udf/super.c | 13
-rw-r--r--  fs/udf/symlink.c | 1
-rw-r--r--  fs/udf/udfdecl.h | 5
-rw-r--r--  fs/udf/unicode.c | 1
-rw-r--r--  fs/ufs/balloc.c | 24
-rw-r--r--  fs/ufs/dir.c | 2
-rw-r--r--  fs/ufs/file.c | 5
-rw-r--r--  fs/ufs/ialloc.c | 23
-rw-r--r--  fs/ufs/inode.c | 6
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/ufs/super.c | 112
-rw-r--r--  fs/ufs/symlink.c | 8
-rw-r--r--  fs/ufs/truncate.c | 24
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/ufs/ufs_fs.h | 1
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 232
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 38
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 207
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 233
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c | 8
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 199
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 165
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_acl.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 25
-rw-r--r--  fs/xfs/xfs_alloc.c | 357
-rw-r--r--  fs/xfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 221
-rw-r--r--  fs/xfs/xfs_buf_item.h | 20
-rw-r--r--  fs/xfs/xfs_dfrag.c | 22
-rw-r--r--  fs/xfs/xfs_error.c | 32
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 834
-rw-r--r--  fs/xfs/xfs_log.h | 27
-rw-r--r--  fs/xfs/xfs_log_cil.c | 725
-rw-r--r--  fs/xfs/xfs_log_priv.h | 130
-rw-r--r--  fs/xfs/xfs_log_recover.c | 355
-rw-r--r--  fs/xfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 810
-rw-r--r--  fs/xfs/xfs_trans.h | 58
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 233
-rw-r--r--  fs/xfs/xfs_trans_item.c | 114
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_types.h | 2
773 files changed, 24707 insertions, 15985 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -237,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +270,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -298,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -323,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -340,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6b801d1ddf4b..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,7 @@
  * Boston, MA 02111-1301 USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
@@ -102,12 +103,14 @@ struct v9fs_session_info {
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d8a3afe4ff72..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -130,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -200,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-				int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-		   dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -43,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -252,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -274,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -431,20 +452,22 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -479,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -492,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -503,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -533,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -656,6 +673,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -667,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -685,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -764,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1189,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1200,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1231,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1238,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,8 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -44,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -75,7 +77,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
@@ -193,6 +199,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +212,43 @@
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
+}
+
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
 }
 
 static const struct super_operations v9fs_super_ops = {
@@ -219,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o fs_struct.o
+	stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 };
 
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out;
 
+	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 
 	if (error)
 		goto out;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void affs_free_prealloc(struct inode *inode);
 extern void affs_truncate(struct inode *);
-int affs_file_fsync(struct file *, struct dentry *, int);
+int affs_file_fsync(struct file *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 8306d53307ed..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
 	ret = write_inode_now(inode, 0);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991 Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include "internal.h" 13#include "internal.h"
15 14
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/slab.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/ctype.h> 17#include <linux/ctype.h>
@@ -190,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
190 struct key *key) 189 struct key *key)
191{ 190{
192 struct page *page; 191 struct page *page;
193 struct file file = {
194 .private_data = key,
195 };
196
197 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
198 193
199 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
200 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
201 kmap(page); 196 kmap(page);
202 if (!PageChecked(page)) 197 if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/gfp.h>
19#include "internal.h" 19#include "internal.h"
20 20
21static int afs_readpage(struct file *file, struct page *page); 21static int afs_readpage(struct file *file, struct page *page);
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
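[With afs_page_filler() exported, callers hand the key straight to the page-cache helpers instead of wrapping it in a fake struct file. A condensed sketch of the resulting call pattern; afs_get_page() is a hypothetical name and the page validation done by the real afs_dir_get_page() is trimmed:]

static struct page *afs_get_page(struct inode *inode, pgoff_t index,
				 struct key *key)
{
	struct page *page;

	/* read_cache_page() passes @key as the opaque first argument of
	 * afs_page_filler() for any page that is not already cached */
	page = read_cache_page(inode->i_mapping, index,
			       afs_page_filler, key);
	if (IS_ERR(page))
		return page;
	kmap(page);		/* callers expect a mapped page */
	return page;
}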
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
15#include "internal.h" 16#include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/slab.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/pagemap.h> 20#include <linux/pagemap.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c54dad4e6063..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fscache.h> 21#include <linux/fscache.h>
22#include <linux/backing-dev.h>
22 23
23#include "afs.h" 24#include "afs.h"
24#include "afs_vl.h" 25#include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
313 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ 314 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
314 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ 315 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
315 struct rw_semaphore server_sem; /* lock for accessing current server */ 316 struct rw_semaphore server_sem; /* lock for accessing current server */
317 struct backing_dev_info bdi;
316}; 318};
317 319
318/* 320/*
@@ -492,6 +494,7 @@ extern const struct file_operations afs_file_operations;
492 494
493extern int afs_open(struct inode *, struct file *); 495extern int afs_open(struct inode *, struct file *);
494extern int afs_release(struct inode *, struct file *); 496extern int afs_release(struct inode *, struct file *);
497extern int afs_page_filler(void *, struct page *);
495 498
496/* 499/*
497 * flock.c 500 * flock.c
@@ -737,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
737extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 740extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
738 unsigned long, loff_t); 741 unsigned long, loff_t);
739extern int afs_writeback_all(struct afs_vnode *); 742extern int afs_writeback_all(struct afs_vnode *);
740extern int afs_fsync(struct file *, struct dentry *, int); 743extern int afs_fsync(struct file *, int);
741 744
742 745
743/*****************************************************************************/ 746/*****************************************************************************/
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
19#include <linux/namei.h> 18#include <linux/namei.h>
19#include <linux/gfp.h>
20#include "internal.h" 20#include "internal.h"
21 21
22 22
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 49 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 51{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 52 struct page *page;
56 size_t size; 53 size_t size;
57 char *buf; 54 char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 58 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 59
63 /* read the contents of the symlink into the pagecache */ 60 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 61 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
62 afs_page_filler, key);
65 if (IS_ERR(page)) { 63 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 64 ret = PTR_ERR(page);
67 goto out; 65 goto out;
@@ -138,9 +136,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
138{ 136{
139 struct afs_super_info *super; 137 struct afs_super_info *super;
140 struct vfsmount *mnt; 138 struct vfsmount *mnt;
141 struct page *page = NULL; 139 struct page *page;
142 size_t size; 140 size_t size;
143 char *buf, *devname = NULL, *options = NULL; 141 char *buf, *devname, *options;
144 int ret; 142 int ret;
145 143
146 _enter("{%s}", mntpt->d_name.name); 144 _enter("{%s}", mntpt->d_name.name);
@@ -150,22 +148,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
150 ret = -EINVAL; 148 ret = -EINVAL;
151 size = mntpt->d_inode->i_size; 149 size = mntpt->d_inode->i_size;
152 if (size > PAGE_SIZE - 1) 150 if (size > PAGE_SIZE - 1)
153 goto error; 151 goto error_no_devname;
154 152
155 ret = -ENOMEM; 153 ret = -ENOMEM;
156 devname = (char *) get_zeroed_page(GFP_KERNEL); 154 devname = (char *) get_zeroed_page(GFP_KERNEL);
157 if (!devname) 155 if (!devname)
158 goto error; 156 goto error_no_devname;
159 157
160 options = (char *) get_zeroed_page(GFP_KERNEL); 158 options = (char *) get_zeroed_page(GFP_KERNEL);
161 if (!options) 159 if (!options)
162 goto error; 160 goto error_no_options;
163 161
164 /* read the contents of the AFS special symlink */ 162 /* read the contents of the AFS special symlink */
165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 163 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
166 if (IS_ERR(page)) { 164 if (IS_ERR(page)) {
167 ret = PTR_ERR(page); 165 ret = PTR_ERR(page);
168 goto error; 166 goto error_no_page;
169 } 167 }
170 168
171 ret = -EIO; 169 ret = -EIO;
@@ -196,12 +194,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
196 return mnt; 194 return mnt;
197 195
198error: 196error:
199 if (page) 197 page_cache_release(page);
200 page_cache_release(page); 198error_no_page:
201 if (devname) 199 free_page((unsigned long) options);
202 free_page((unsigned long) devname); 200error_no_options:
203 if (options) 201 free_page((unsigned long) devname);
204 free_page((unsigned long) options); 202error_no_devname:
205 _leave(" = %d", ret); 203 _leave(" = %d", ret);
206 return ERR_PTR(ret); 204 return ERR_PTR(ret);
207} 205}
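[The rewritten failure path replaces NULL-initialised pointers and conditional cleanup with one label per acquired resource, unwinding in reverse order of acquisition; nothing needs to be pre-initialised and each label frees exactly what was already obtained. The idiom distilled from the hunk above:]

	ret = -ENOMEM;
	devname = (char *) get_zeroed_page(GFP_KERNEL);
	if (!devname)
		goto error_no_devname;		/* nothing acquired yet */
	options = (char *) get_zeroed_page(GFP_KERNEL);
	if (!options)
		goto error_no_options;		/* only devname to free */
	/* ... */
error_no_options:
	free_page((unsigned long) devname);
error_no_devname:
	return ERR_PTR(ret);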
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <net/sock.h> 13#include <net/sock.h>
13#include <net/af_rxrpc.h> 14#include <net/af_rxrpc.h>
14#include <rxrpc/packet.h> 15#include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
189 if (!permits) 189 if (!permits)
190 goto out_unlock; 190 goto out_unlock;
191 191
192 memcpy(permits->permits, xpermits->permits, 192 if (xpermits)
193 count * sizeof(struct afs_permit)); 193 memcpy(permits->permits, xpermits->permits,
194 count * sizeof(struct afs_permit));
194 195
195 _debug("key %x access %x", 196 _debug("key %x access %x",
196 key_serial(key), vnode->status.caller_access); 197 key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 14f6431598ad..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
311 sb->s_magic = AFS_FS_MAGIC; 311 sb->s_magic = AFS_FS_MAGIC;
312 sb->s_op = &afs_super_ops; 312 sb->s_op = &afs_super_ops;
313 sb->s_fs_info = as; 313 sb->s_fs_info = as;
314 sb->s_bdi = &as->volume->bdi;
314 315
315 /* allocate the root inode and dentry */ 316 /* allocate the root inode and dentry */
316 fid.vid = as->volume->vid; 317 fid.vid = as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/gfp.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
18#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
106 volume->cell = params->cell; 106 volume->cell = params->cell;
107 volume->vid = vlocation->vldb.vid[params->type]; 107 volume->vid = vlocation->vldb.vid[params->type];
108 108
109 ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
110 if (ret)
111 goto error_bdi;
112
109 init_rwsem(&volume->server_sem); 113 init_rwsem(&volume->server_sem);
110 114
111 /* look up all the applicable server records */ 115 /* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
151 return ERR_PTR(ret); 155 return ERR_PTR(ret);
152 156
153error_discard: 157error_discard:
158 bdi_destroy(&volume->bdi);
159error_bdi:
154 up_write(&params->cell->vl_sem); 160 up_write(&params->cell->vl_sem);
155 161
156 for (loop = volume->nservers - 1; loop >= 0; loop--) 162 for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
200 for (loop = volume->nservers - 1; loop >= 0; loop--) 206 for (loop = volume->nservers - 1; loop >= 0; loop--)
201 afs_put_server(volume->servers[loop]); 207 afs_put_server(volume->servers[loop]);
202 208
209 bdi_destroy(&volume->bdi);
203 kfree(volume); 210 kfree(volume);
204 211
205 _leave(" [destroyed]"); 212 _leave(" [destroyed]");
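[Each volume now embeds its own backing_dev_info, registered at lookup and destroyed on both the error path and the final put; afs_fill_super() points sb->s_bdi at it so writeback is accounted per volume. A minimal sketch of the same pairing for any structure embedding a BDI -- the my_volume names are hypothetical:]

struct my_volume {
	struct backing_dev_info bdi;
	/* ... */
};

static int my_volume_init(struct my_volume *v)
{
	/* registers a dynamically numbered BDI named "afs-N" */
	return bdi_setup_and_register(&v->bdi, "afs", BDI_CAP_MAP_COPY);
}

static void my_volume_free(struct my_volume *v)
{
	bdi_destroy(&v->bdi);	/* must balance every successful setup */
	kfree(v);
}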
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 701 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 702 * whether any write errors occurred for this process.
703 */ 703 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 704int afs_fsync(struct file *file, int datasync)
705{ 705{
706 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 707 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 708 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 709 int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1385 return ret;
1385} 1386}
1386 1387
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1388static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1389{
1389 ssize_t ret; 1390 ssize_t ret;
1390 1391
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1392#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1393 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1394 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec);
1398 else
1399#endif
1400 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec);
1394 if (ret < 0) 1404 if (ret < 0)
1395 goto out; 1405 goto out;
1396 1406
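[The split is needed because a 32-bit caller hands over iovecs whose pointer and length fields are 32 bits wide, so they cannot be copied as native struct iovec. For reference, the compat layout the new branch converts from, as defined in include/linux/compat.h:]

struct compat_iovec {
	compat_uptr_t	iov_base;	/* 32-bit user pointer */
	compat_size_t	iov_len;
};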
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1430 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1431 * setup for the kiocb at the time of io submission.
1422 */ 1432 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1433static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1434{
1425 struct file *file = kiocb->ki_filp; 1435 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1436 ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1479 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1480 if (unlikely(ret))
1471 break; 1481 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1482 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1483 if (ret)
1474 break; 1484 break;
1475 ret = -EINVAL; 1485 ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1493 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1494 if (unlikely(ret))
1485 break; 1495 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1497 if (ret)
1488 break; 1498 break;
1489 ret = -EINVAL; 1499 ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1558}
1549 1559
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1560static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1561 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat)
1552{ 1563{
1553 struct kiocb *req; 1564 struct kiocb *req;
1554 struct file *file; 1565 struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1621 req->ki_opcode = iocb->aio_lio_opcode;
1611 1622
1612 ret = aio_setup_iocb(req); 1623 ret = aio_setup_iocb(req, compat);
1613 1624
1614 if (ret) 1625 if (ret)
1615 goto out_put_req; 1626 goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
1637 return ret; 1648 return ret;
1638} 1649}
1639 1650
1640/* sys_io_submit: 1651long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1652 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1653{
1655 struct kioctx *ctx; 1654 struct kioctx *ctx;
1656 long ret = 0; 1655 long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1686 break;
1688 } 1687 }
1689 1688
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1689 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1690 if (ret)
1692 break; 1691 break;
1693 } 1692 }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1696 return i ? i : ret;
1698} 1697}
1699 1698
1699/* sys_io_submit:
1700 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1701 * the number of iocbs queued. May return -EINVAL if the aio_context
1702 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1703 * *iocbpp[0] is not properly initialized, if the operation specified
1704 * is invalid for the file descriptor in the iocb. May fail with
1705 * -EFAULT if any of the data structures point to invalid data. May
1706 * fail with -EBADF if the file descriptor specified in the first
1707 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1708 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1709 * fail with -ENOSYS if not implemented.
1710 */
1711SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1712 struct iocb __user * __user *, iocbpp)
1713{
1714 return do_io_submit(ctx_id, nr, iocbpp, 0);
1715}
1716
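[The compat entry point that passes compat=true is not part of this file; in this series it lives in fs/compat.c. Reconstructed here as a sketch -- the MAX_AIO_IOCBS clamp and the copy_iocb() pointer-widening helper are assumptions, not shown in this excerpt:]

asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr,
				     u32 __user *iocb)
{
	struct iocb __user * __user *iocb64;
	long ret;

	if (unlikely(nr < 0))
		return -EINVAL;
	if (nr > MAX_AIO_IOCBS)
		nr = MAX_AIO_IOCBS;

	/* widen the 32-bit iocb pointers, then submit with compat=1 */
	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
	ret = copy_iocb(nr, iocb, iocb64);
	if (!ret)
		ret = do_io_submit(ctx_id, nr, iocb64, 1);
	return ret;
}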
1700/* lookup_kiocb 1717/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1718 * Finds a given iocb for cancellation.
1702 */ 1719 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2de009565d8e..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/poll.h> 13#include <linux/poll.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 67 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 68 * @Returns: 0 on success, -ve errno on failure
69 * 69 *
70 * inode_newsize_ok must be called with i_mutex held.
71 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 72 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 73 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 74 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 75 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 76 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 77 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 78 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 79int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 80{
@@ -104,17 +104,25 @@ out_big:
104} 104}
105EXPORT_SYMBOL(inode_newsize_ok); 105EXPORT_SYMBOL(inode_newsize_ok);
106 106
107int inode_setattr(struct inode * inode, struct iattr * attr) 107/**
108 * generic_setattr - copy simple metadata updates into the generic inode
109 * @inode: the inode to be updated
110 * @attr: the new attributes
111 *
112 * generic_setattr must be called with i_mutex held.
113 *
114 * generic_setattr updates the inode's metadata with that specified
 115 * in attr. Noticeably missing is inode size update, which is more complex
116 * as it requires pagecache updates. See simple_setsize.
117 *
118 * The inode is not marked as dirty after this operation. The rationale is
119 * that for "simple" filesystems, the struct inode is the inode storage.
120 * The caller is free to mark the inode dirty afterwards if needed.
121 */
122void generic_setattr(struct inode *inode, const struct iattr *attr)
108{ 123{
109 unsigned int ia_valid = attr->ia_valid; 124 unsigned int ia_valid = attr->ia_valid;
110 125
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 126 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 127 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 128 if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 143 mode &= ~S_ISGID;
136 inode->i_mode = mode; 144 inode->i_mode = mode;
137 } 145 }
146}
147EXPORT_SYMBOL(generic_setattr);
148
149/*
 150 * Note: this function is deprecated; the new truncate sequence should be
 151 * used instead -- see e.g. simple_setsize, generic_setattr.
152 */
153int inode_setattr(struct inode *inode, const struct iattr *attr)
154{
155 unsigned int ia_valid = attr->ia_valid;
156
157 if (ia_valid & ATTR_SIZE &&
158 attr->ia_size != i_size_read(inode)) {
159 int error;
160
161 error = vmtruncate(inode, attr->ia_size);
162 if (error)
163 return error;
164 }
165
166 generic_setattr(inode, attr);
167
138 mark_inode_dirty(inode); 168 mark_inode_dirty(inode);
139 169
140 return 0; 170 return 0;
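[A sketch of how a simple filesystem's ->setattr would use the new helper under the new truncate sequence. myfs_setattr is a hypothetical name; simple_setsize() comes from the same patch series and is assumed available:]

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* size changes go through the pagecache-aware path */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = simple_setsize(inode, attr->ia_size);
		if (error)
			return error;
	}

	generic_setattr(inode, attr);
	mark_inode_dirty(inode);	/* generic_setattr leaves this to us */
	return 0;
}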
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/slab.h>
16#include <linux/param.h> 17#include <linux/param.h>
17#include <linux/time.h> 18#include <linux/time.h>
18#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
@@ -27,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
27static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
28 29
29const struct file_operations autofs_root_operations = { 30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
30 .read = generic_read_dir, 32 .read = generic_read_dir,
31 .readdir = autofs_root_readdir, 33 .readdir = autofs_root_readdir,
32 .ioctl = autofs_root_ioctl, 34 .ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
22#include <linux/magic.h> 22#include <linux/magic.h>
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/slab.h>
25 26
26#include "autofs_i.h" 27#include "autofs_i.h"
27 28
@@ -94,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
94 */ 95 */
95static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
96{ 97{
97 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
98 99
99 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
100 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -102,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
102 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
103 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
104 105
105 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
106 if (!ads)
107 return ERR_PTR(-ENOMEM);
108
109 if (copy_from_user(ads, in, tmp.size)) {
110 kfree(ads);
111 return ERR_PTR(-EFAULT);
112 }
113
114 return ads;
115} 107}
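[memdup_user() folds the allocate/copy/free-on-fault dance into a single call that returns either the buffer or an ERR_PTR, which is why the kmalloc() and second copy_from_user() above collapse to one line. The general pattern:]

	void *buf;

	buf = memdup_user(uptr, size);	/* kmalloc + copy_from_user */
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	/* ... use buf ... */
	kfree(buf);			/* caller still owns the free */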
116 108
117static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -735,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
735}; 727};
736 728
737static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
738 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
739 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
740 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
741}; 733};
742 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
743/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
744int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
745{ 740{
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,15 +15,17 @@
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/slab.h>
18#include <linux/param.h> 19#include <linux/param.h>
19#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/smp_lock.h>
20#include "autofs_i.h" 22#include "autofs_i.h"
21 23
22static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 24static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
23static int autofs4_dir_unlink(struct inode *,struct dentry *); 25static int autofs4_dir_unlink(struct inode *,struct dentry *);
24static int autofs4_dir_rmdir(struct inode *,struct dentry *); 26static int autofs4_dir_rmdir(struct inode *,struct dentry *);
25static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 27static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
26static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
27static int autofs4_dir_open(struct inode *inode, struct file *file); 29static int autofs4_dir_open(struct inode *inode, struct file *file);
28static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 30static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
29static void *autofs4_follow_link(struct dentry *, struct nameidata *); 31static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -37,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
37 .read = generic_read_dir, 39 .read = generic_read_dir,
38 .readdir = dcache_readdir, 40 .readdir = dcache_readdir,
39 .llseek = dcache_dir_lseek, 41 .llseek = dcache_dir_lseek,
40 .ioctl = autofs4_root_ioctl, 42 .unlocked_ioctl = autofs4_root_ioctl,
41}; 43};
42 44
43const struct file_operations autofs4_dir_operations = { 45const struct file_operations autofs4_dir_operations = {
@@ -176,8 +178,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
176 } 178 }
177 /* Trigger mount for path component or follow link */ 179 /* Trigger mount for path component or follow link */
178 } else if (ino->flags & AUTOFS_INF_PENDING || 180 } else if (ino->flags & AUTOFS_INF_PENDING ||
179 autofs4_need_mount(flags) || 181 autofs4_need_mount(flags)) {
180 current->link_count) {
181 DPRINTK("waiting for mount name=%.*s", 182 DPRINTK("waiting for mount name=%.*s",
182 dentry->d_name.len, dentry->d_name.name); 183 dentry->d_name.len, dentry->d_name.name);
183 184
@@ -261,7 +262,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
261 spin_unlock(&dcache_lock); 262 spin_unlock(&dcache_lock);
262 spin_unlock(&sbi->fs_lock); 263 spin_unlock(&sbi->fs_lock);
263 264
264 status = try_to_fill_dentry(dentry, 0); 265 status = try_to_fill_dentry(dentry, nd->flags);
265 if (status) 266 if (status)
266 goto out_error; 267 goto out_error;
267 268
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
902 * ioctl()'s on the root directory is the chief method for the daemon to 903 * ioctl()'s on the root directory is the chief method for the daemon to
903 * generate kernel reactions 904 * generate kernel reactions
904 */ 905 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 906static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 907 unsigned int cmd, unsigned long arg)
907{ 908{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 909 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 910 void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 948 return -ENOSYS;
948 } 949 }
949} 950}
951
952static long autofs4_root_ioctl(struct file *filp,
953 unsigned int cmd, unsigned long arg)
954{
955 long ret;
956 struct inode *inode = filp->f_dentry->d_inode;
957
958 lock_kernel();
959 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
960 unlock_kernel();
961
962 return ret;
963}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 93 return -EIO;
94} 94}
95 95
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 96static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 97{
99 return -EIO; 98 return -EIO;
100} 99}
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/string.h> 15#include <linux/string.h>
17 16
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
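[inode_init_owner() centralises the owner/mode setup that bfs, like many filesystems in this series, open-coded, including the setgid-directory rules. Roughly what the helper does, paraphrased from fs/inode.c, so treat the details as a sketch:]

void inode_init_owner(struct inode *inode, const struct inode *dir,
		      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit the group */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* propagate setgid on dirs */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}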
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/slab.h>
24#include <linux/binfmts.h> 23#include <linux/binfmts.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
26#include <linux/init.h> 25#include <linux/init.h>
27#include <linux/coredump.h> 26#include <linux/coredump.h>
27#include <linux/slab.h>
28 28
29#include <asm/system.h> 29#include <asm/system.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
75 struct file *file = cprm->file; 75 struct file *file = cprm->file;
76 mm_segment_t fs; 76 mm_segment_t fs;
77 int has_dumped = 0; 77 int has_dumped = 0;
78 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
79 struct user dump; 80 struct user dump;
80#ifdef __alpha__ 81#ifdef __alpha__
81# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
82#else 83#else
83# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
84#endif 86#endif
85# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
86 88
87 fs = get_fs(); 89 fs = get_fs();
88 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
104 106
105/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
106 set_fs(USER_DS); 108 set_fs(USER_DS);
107 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
108 dump.u_dsize = 0; 110 dump.u_dsize = 0;
109 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
110 dump.u_ssize = 0; 112 dump.u_ssize = 0;
111 113
112 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
1005 } 1005 }
1006 } else if (!mm->start_data) { 1006 } else if (!mm->start_data) {
1007 mm->start_data = seg->addr; 1007 mm->start_data = seg->addr;
1008#ifndef CONFIG_MMU
1009 mm->end_data = seg->addr + phdr->p_memsz; 1008 mm->end_data = seg->addr + phdr->p_memsz;
1010#endif
1011 } 1009 }
1012
1013#ifdef CONFIG_MMU
1014 if (seg->addr + phdr->p_memsz > mm->end_data)
1015 mm->end_data = seg->addr + phdr->p_memsz;
1016#endif
1017 } 1010 }
1018 1011
1019 seg++; 1012 seg++;
@@ -1590,7 +1583,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
1590 struct vm_area_struct *vma; 1583 struct vm_area_struct *vma;
1591 size_t size = 0; 1584 size_t size = 0;
1592 1585
1593 for (vma = current->mm->mmap; vma; vma->vm_next) 1586 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1594 if (maydump(vma, mm_flags)) 1587 if (maydump(vma, mm_flags))
1595 size += vma->vm_end - vma->vm_start; 1588 size += vma->vm_end - vma->vm_start;
1596 return size; 1589 return size;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/slab.h>
15#include <linux/binfmts.h> 14#include <linux/binfmts.h>
16#include <linux/elf.h> 15#include <linux/elf.h>
17#include <linux/init.h> 16#include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
355 355
356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { 356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", 357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
358 (int) r,(int)(start_brk-start_code),(int)text_len); 358 (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
359 goto failed; 359 goto failed;
360 } 360 }
361 361
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/stat.h> 10#include <linux/stat.h>
11#include <linux/slab.h>
12#include <linux/binfmts.h> 11#include <linux/binfmts.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/file.h> 13#include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/slab.h>
27 28
28struct integrity_slab { 29struct integrity_slab {
29 struct kmem_cache *slab; 30 struct kmem_cache *slab;
diff --git a/fs/bio.c b/fs/bio.c
index e1f922184b45..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -554,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
554 .bi_rw = bio->bi_rw, 554 .bi_rw = bio->bi_rw,
555 }; 555 };
556 556
557 if (q->merge_bvec_fn(q, &bvm, prev) < len) { 557 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
558 prev->bv_len -= len; 558 prev->bv_len -= len;
559 return 0; 559 return 0;
560 } 560 }
@@ -607,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
607 * merge_bvec_fn() returns number of bytes it can accept 607 * merge_bvec_fn() returns number of bytes it can accept
608 * at this offset 608 * at this offset
609 */ 609 */
610 if (q->merge_bvec_fn(q, &bvm, bvec) < len) { 610 if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
611 bvec->bv_page = NULL; 611 bvec->bv_page = NULL;
612 bvec->bv_len = 0; 612 bvec->bv_len = 0;
613 bvec->bv_offset = 0; 613 bvec->bv_offset = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..7346c96308a5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 I_BDEV(inode), iov, offset, nr_segs,
177 blkdev_get_blocks, NULL);
177} 178}
178 179
179int __sync_blockdev(struct block_device *bdev, int wait) 180int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,37 +246,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 246 sb = get_active_super(bdev);
246 if (!sb) 247 if (!sb)
247 goto out; 248 goto out;
248 if (sb->s_flags & MS_RDONLY) { 249 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 250 if (error) {
250 up_write(&sb->s_umount); 251 deactivate_super(sb);
252 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 253 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 254 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 255 }
277 up_write(&sb->s_umount); 256 deactivate_super(sb);
278
279 out: 257 out:
280 sync_blockdev(bdev); 258 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 259 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +274,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 274
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 275 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 276 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 277 goto out;
300 278
301 error = 0; 279 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 280 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 281 goto out;
304 282
305 if (!sb) 283 if (!sb)
306 goto out_unlock; 284 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 285
330 if (sb) 286 error = thaw_super(sb);
331 deactivate_locked_super(sb); 287 if (error) {
332out_unlock: 288 bdev->bd_fsfreeze_count++;
289 mutex_unlock(&bdev->bd_fsfreeze_mutex);
290 return error;
291 }
292out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 293 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 294 return 0;
335} 295}
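[freeze_bdev()/thaw_bdev() now keep only the per-device nesting count and delegate the filesystem state transitions to freeze_super()/thaw_super(). Usage from a hypothetical snapshotting caller -- take_snapshot() is an assumed driver-side hook:]

	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* NULL if nothing is mounted */
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	take_snapshot(bdev);		/* device is quiesced here */
	thaw_bdev(bdev, sb);		/* balances the freeze */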
@@ -350,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
350 struct page **pagep, void **fsdata) 310 struct page **pagep, void **fsdata)
351{ 311{
352 *pagep = NULL; 312 *pagep = NULL;
353 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 313 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
354 blkdev_get_block); 314 pagep, fsdata, blkdev_get_block);
355} 315}
356 316
357static int blkdev_write_end(struct file *file, struct address_space *mapping, 317static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -399,25 +359,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
399 return retval; 359 return retval;
400} 360}
401 361
402/* 362int blkdev_fsync(struct file *filp, int datasync)
403 * Filp is never NULL; the only case when ->fsync() is called with
404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
405 */
406
407static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
408{ 363{
409 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 364 struct inode *bd_inode = filp->f_mapping->host;
365 struct block_device *bdev = I_BDEV(bd_inode);
410 int error; 366 int error;
411 367
412 error = sync_blockdev(bdev); 368 /*
413 if (error) 369 * There is no need to serialise calls to blkdev_issue_flush with
414 return error; 370 * i_mutex and doing so causes performance issues with concurrent
415 371 * O_SYNC writers to a block device.
416 error = blkdev_issue_flush(bdev, NULL); 372 */
373 mutex_unlock(&bd_inode->i_mutex);
374
375 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
417 if (error == -EOPNOTSUPP) 376 if (error == -EOPNOTSUPP)
418 error = 0; 377 error = 0;
378
379 mutex_lock(&bd_inode->i_mutex);
380
419 return error; 381 return error;
420} 382}
383EXPORT_SYMBOL(blkdev_fsync);
421 384
422/* 385/*
423 * pseudo-fs 386 * pseudo-fs
@@ -660,41 +623,209 @@ void bd_forget(struct inode *inode)
660 iput(bdev->bd_inode); 623 iput(bdev->bd_inode);
661} 624}
662 625
663int bd_claim(struct block_device *bdev, void *holder) 626/**
627 * bd_may_claim - test whether a block device can be claimed
628 * @bdev: block device of interest
629 * @whole: whole block device containing @bdev, may equal @bdev
630 * @holder: holder trying to claim @bdev
631 *
 632 * Test whether @bdev can be claimed by @holder.
633 *
634 * CONTEXT:
635 * spin_lock(&bdev_lock).
636 *
637 * RETURNS:
638 * %true if @bdev can be claimed, %false otherwise.
639 */
640static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
641 void *holder)
664{ 642{
665 int res;
666 spin_lock(&bdev_lock);
667
668 /* first decide result */
669 if (bdev->bd_holder == holder) 643 if (bdev->bd_holder == holder)
670 res = 0; /* already a holder */ 644 return true; /* already a holder */
671 else if (bdev->bd_holder != NULL) 645 else if (bdev->bd_holder != NULL)
672 res = -EBUSY; /* held by someone else */ 646 return false; /* held by someone else */
673 else if (bdev->bd_contains == bdev) 647 else if (bdev->bd_contains == bdev)
674 res = 0; /* is a whole device which isn't held */ 648 return true; /* is a whole device which isn't held */
675 649
676 else if (bdev->bd_contains->bd_holder == bd_claim) 650 else if (whole->bd_holder == bd_claim)
677 res = 0; /* is a partition of a device that is being partitioned */ 651 return true; /* is a partition of a device that is being partitioned */
678 else if (bdev->bd_contains->bd_holder != NULL) 652 else if (whole->bd_holder != NULL)
679 res = -EBUSY; /* is a partition of a held device */ 653 return false; /* is a partition of a held device */
680 else 654 else
681 res = 0; /* is a partition of an un-held device */ 655 return true; /* is a partition of an un-held device */
656}
657
658/**
659 * bd_prepare_to_claim - prepare to claim a block device
660 * @bdev: block device of interest
661 * @whole: the whole device containing @bdev, may equal @bdev
662 * @holder: holder trying to claim @bdev
663 *
664 * Prepare to claim @bdev. This function fails if @bdev is already
665 * claimed by another holder and waits if another claiming is in
666 * progress. This function doesn't actually claim. On successful
667 * return, the caller has ownership of bd_claiming and bd_holder[s].
668 *
669 * CONTEXT:
670 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
671 * it multiple times.
672 *
673 * RETURNS:
674 * 0 if @bdev can be claimed, -EBUSY otherwise.
675 */
676static int bd_prepare_to_claim(struct block_device *bdev,
677 struct block_device *whole, void *holder)
678{
679retry:
680 /* if someone else claimed, fail */
681 if (!bd_may_claim(bdev, whole, holder))
682 return -EBUSY;
683
684 /* if someone else is claiming, wait for it to finish */
685 if (whole->bd_claiming && whole->bd_claiming != holder) {
686 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
687 DEFINE_WAIT(wait);
688
689 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
690 spin_unlock(&bdev_lock);
691 schedule();
692 finish_wait(wq, &wait);
693 spin_lock(&bdev_lock);
694 goto retry;
695 }
696
697 /* yay, all mine */
698 return 0;
699}
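[Because bd_claiming is a pointer rather than a flag word, the code cannot use wait_on_bit() directly; it open-codes the sleep on the bit waitqueue derived from the pointer's address. The matching wake-up side, from __bd_abort_claiming() below, is the pairing that ends the claiming block:]

	/* owner side, under bdev_lock: end the claiming block */
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);	/* wakes the bit_waitqueue(..., 0) sleepers */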
700
701/**
702 * bd_start_claiming - start claiming a block device
703 * @bdev: block device of interest
704 * @holder: holder trying to claim @bdev
705 *
706 * @bdev is about to be opened exclusively. Check @bdev can be opened
707 * exclusively and mark that an exclusive open is in progress. Each
708 * successful call to this function must be matched with a call to
709 * either bd_claim() or bd_abort_claiming(). If this function
710 * succeeds, the matching bd_claim() is guaranteed to succeed.
711 *
712 * CONTEXT:
713 * Might sleep.
714 *
715 * RETURNS:
716 * Pointer to the block device containing @bdev on success, ERR_PTR()
717 * value on failure.
718 */
719static struct block_device *bd_start_claiming(struct block_device *bdev,
720 void *holder)
721{
722 struct gendisk *disk;
723 struct block_device *whole;
724 int partno, err;
725
726 might_sleep();
727
728 /*
729 * @bdev might not have been initialized properly yet, look up
730 * and grab the outer block device the hard way.
731 */
732 disk = get_gendisk(bdev->bd_dev, &partno);
733 if (!disk)
734 return ERR_PTR(-ENXIO);
735
736 whole = bdget_disk(disk, 0);
737 put_disk(disk);
738 if (!whole)
739 return ERR_PTR(-ENOMEM);
740
741 /* prepare to claim, if successful, mark claiming in progress */
742 spin_lock(&bdev_lock);
743
744 err = bd_prepare_to_claim(bdev, whole, holder);
745 if (err == 0) {
746 whole->bd_claiming = holder;
747 spin_unlock(&bdev_lock);
748 return whole;
749 } else {
750 spin_unlock(&bdev_lock);
751 bdput(whole);
752 return ERR_PTR(err);
753 }
754}
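
bd_start_claiming(), bd_claim() and bd_abort_claiming() give exclusive-open paths a two-phase shape, which the converted callers below follow. A condensed sketch of that shape (helper name hypothetical, error handling trimmed):

static struct block_device *open_excl_sketch(struct block_device *bdev,
					     fmode_t mode, void *holder)
{
	struct block_device *whole;
	int err;

	whole = bd_start_claiming(bdev, holder);  /* phase 1: pin the claim */
	if (IS_ERR(whole))
		return whole;

	err = blkdev_get(bdev, mode);		  /* open with claim pinned */
	if (err) {
		bd_abort_claiming(whole, holder); /* undo the pin */
		return ERR_PTR(err);
	}

	BUG_ON(bd_claim(bdev, holder) != 0);	  /* phase 2: cannot fail */
	return bdev;
}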
682 755
683 /* now impose change */ 756/* releases bdev_lock */
684 if (res==0) { 757static void __bd_abort_claiming(struct block_device *whole, void *holder)
758{
759 BUG_ON(whole->bd_claiming != holder);
760 whole->bd_claiming = NULL;
761 wake_up_bit(&whole->bd_claiming, 0);
762
763 spin_unlock(&bdev_lock);
764 bdput(whole);
765}
766
767/**
768 * bd_abort_claiming - abort claiming a block device
769 * @whole: whole block device returned by bd_start_claiming()
770 * @holder: holder trying to claim @bdev
771 *
772 * Abort the claiming block started by bd_start_claiming(). Note that
773 * @whole is not the block device to be claimed but the whole device
774 * returned by bd_start_claiming().
775 *
776 * CONTEXT:
777 * Grabs and releases bdev_lock.
778 */
779static void bd_abort_claiming(struct block_device *whole, void *holder)
780{
781 spin_lock(&bdev_lock);
782 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
783}
784
785/**
786 * bd_claim - claim a block device
787 * @bdev: block device to claim
788 * @holder: holder trying to claim @bdev
789 *
790 * Try to claim @bdev which must have been opened successfully. This
791 * function may be called with or without preceding
792 * blk_start_claiming(). In the former case, this function is always
793 * successful and terminates the claiming block.
794 *
795 * CONTEXT:
796 * Might sleep.
797 *
798 * RETURNS:
799 * 0 if successful, -EBUSY if @bdev is already claimed.
800 */
801int bd_claim(struct block_device *bdev, void *holder)
802{
803 struct block_device *whole = bdev->bd_contains;
804 int res;
805
806 might_sleep();
807
808 spin_lock(&bdev_lock);
809
810 res = bd_prepare_to_claim(bdev, whole, holder);
811 if (res == 0) {
685 /* note that for a whole device bd_holders 812 /* note that for a whole device bd_holders
686 * will be incremented twice, and bd_holder will 813 * will be incremented twice, and bd_holder will
687 * be set to bd_claim before being set to holder 814 * be set to bd_claim before being set to holder
688 */ 815 */
689 bdev->bd_contains->bd_holders ++; 816 whole->bd_holders++;
690 bdev->bd_contains->bd_holder = bd_claim; 817 whole->bd_holder = bd_claim;
691 bdev->bd_holders++; 818 bdev->bd_holders++;
692 bdev->bd_holder = holder; 819 bdev->bd_holder = holder;
693 } 820 }
694 spin_unlock(&bdev_lock); 821
822 if (whole->bd_claiming)
823 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
824 else
825 spin_unlock(&bdev_lock);
826
695 return res; 827 return res;
696} 828}
697
698EXPORT_SYMBOL(bd_claim); 829EXPORT_SYMBOL(bd_claim);
699 830
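
When no bd_start_claiming() precedes it, bd_claim() is simply paired with bd_release(); a minimal sketch (caller name hypothetical):

static int with_exclusive_access(struct block_device *bdev, void *holder)
{
	int err;

	err = bd_claim(bdev, holder);	/* may sleep; -EBUSY if held */
	if (err)
		return err;

	/* ... exclusive access to the device ... */

	bd_release(bdev);		/* drop the claim */
	return 0;
}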
700void bd_release(struct block_device *bdev) 831void bd_release(struct block_device *bdev)
@@ -1308,6 +1439,7 @@ EXPORT_SYMBOL(blkdev_get);
1308 1439
1309static int blkdev_open(struct inode * inode, struct file * filp) 1440static int blkdev_open(struct inode * inode, struct file * filp)
1310{ 1441{
1442 struct block_device *whole = NULL;
1311 struct block_device *bdev; 1443 struct block_device *bdev;
1312 int res; 1444 int res;
1313 1445
@@ -1330,22 +1462,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1330 if (bdev == NULL) 1462 if (bdev == NULL)
1331 return -ENOMEM; 1463 return -ENOMEM;
1332 1464
1465 if (filp->f_mode & FMODE_EXCL) {
1466 whole = bd_start_claiming(bdev, filp);
1467 if (IS_ERR(whole)) {
1468 bdput(bdev);
1469 return PTR_ERR(whole);
1470 }
1471 }
1472
1333 filp->f_mapping = bdev->bd_inode->i_mapping; 1473 filp->f_mapping = bdev->bd_inode->i_mapping;
1334 1474
1335 res = blkdev_get(bdev, filp->f_mode); 1475 res = blkdev_get(bdev, filp->f_mode);
1336 if (res)
1337 return res;
1338 1476
1339 if (filp->f_mode & FMODE_EXCL) { 1477 if (whole) {
1340 res = bd_claim(bdev, filp); 1478 if (res == 0)
1341 if (res) 1479 BUG_ON(bd_claim(bdev, filp) != 0);
1342 goto out_blkdev_put; 1480 else
1481 bd_abort_claiming(whole, filp);
1343 } 1482 }
1344 1483
1345 return 0;
1346
1347 out_blkdev_put:
1348 blkdev_put(bdev, filp->f_mode);
1349 return res; 1484 return res;
1350} 1485}
1351 1486
@@ -1481,7 +1616,7 @@ const struct file_operations def_blk_fops = {
1481 .aio_read = generic_file_aio_read, 1616 .aio_read = generic_file_aio_read,
1482 .aio_write = blkdev_aio_write, 1617 .aio_write = blkdev_aio_write,
1483 .mmap = generic_file_mmap, 1618 .mmap = generic_file_mmap,
1484 .fsync = block_fsync, 1619 .fsync = blkdev_fsync,
1485 .unlocked_ioctl = block_ioctl, 1620 .unlocked_ioctl = block_ioctl,
1486#ifdef CONFIG_COMPAT 1621#ifdef CONFIG_COMPAT
1487 .compat_ioctl = compat_blkdev_ioctl, 1622 .compat_ioctl = compat_blkdev_ioctl,
@@ -1556,27 +1691,34 @@ EXPORT_SYMBOL(lookup_bdev);
1556 */ 1691 */
1557struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1692struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1558{ 1693{
1559 struct block_device *bdev; 1694 struct block_device *bdev, *whole;
1560 int error = 0; 1695 int error;
1561 1696
1562 bdev = lookup_bdev(path); 1697 bdev = lookup_bdev(path);
1563 if (IS_ERR(bdev)) 1698 if (IS_ERR(bdev))
1564 return bdev; 1699 return bdev;
1565 1700
1701 whole = bd_start_claiming(bdev, holder);
1702 if (IS_ERR(whole)) {
1703 bdput(bdev);
1704 return whole;
1705 }
1706
1566 error = blkdev_get(bdev, mode); 1707 error = blkdev_get(bdev, mode);
1567 if (error) 1708 if (error)
1568 return ERR_PTR(error); 1709 goto out_abort_claiming;
1710
1569 error = -EACCES; 1711 error = -EACCES;
1570 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1712 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1571 goto blkdev_put; 1713 goto out_blkdev_put;
1572 error = bd_claim(bdev, holder);
1573 if (error)
1574 goto blkdev_put;
1575 1714
1715 BUG_ON(bd_claim(bdev, holder) != 0);
1576 return bdev; 1716 return bdev;
1577 1717
1578blkdev_put: 1718out_blkdev_put:
1579 blkdev_put(bdev, mode); 1719 blkdev_put(bdev, mode);
1720out_abort_claiming:
1721 bd_abort_claiming(whole, holder);
1580 return ERR_PTR(error); 1722 return ERR_PTR(error);
1581} 1723}
1582 1724
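
A typical consumer of open_bdev_exclusive() is a filesystem pinning an extra device for the lifetime of a mount. A hedged sketch, assuming the usual close_bdev_exclusive() counterpart and hypothetical my_fs names:

struct my_fs_info {
	struct block_device *journal_bdev;
};

static int my_fs_attach_journal(struct my_fs_info *fs, const char *path)
{
	struct block_device *bdev;

	bdev = open_bdev_exclusive(path, FMODE_READ | FMODE_WRITE, fs);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* e.g. -EBUSY, -EACCES, -ENOENT */

	fs->journal_bdev = bdev;
	return 0;
}

static void my_fs_detach_journal(struct my_fs_info *fs)
{
	/* mode must match the one passed to open_bdev_exclusive() */
	close_bdev_exclusive(fs->journal_bdev, FMODE_READ | FMODE_WRITE);
}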
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..8d432cd9d580 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -281,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
281 return ret; 282 return ret;
282} 283}
283 284
284struct xattr_handler btrfs_xattr_acl_default_handler = { 285const struct xattr_handler btrfs_xattr_acl_default_handler = {
285 .prefix = POSIX_ACL_XATTR_DEFAULT, 286 .prefix = POSIX_ACL_XATTR_DEFAULT,
286 .flags = ACL_TYPE_DEFAULT, 287 .flags = ACL_TYPE_DEFAULT,
287 .get = btrfs_xattr_acl_get, 288 .get = btrfs_xattr_acl_get,
288 .set = btrfs_xattr_acl_set, 289 .set = btrfs_xattr_acl_set,
289}; 290};
290 291
291struct xattr_handler btrfs_xattr_acl_access_handler = { 292const struct xattr_handler btrfs_xattr_acl_access_handler = {
292 .prefix = POSIX_ACL_XATTR_ACCESS, 293 .prefix = POSIX_ACL_XATTR_ACCESS,
293 .flags = ACL_TYPE_ACCESS, 294 .flags = ACL_TYPE_ACCESS,
294 .get = btrfs_xattr_acl_get, 295 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
376 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
377 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
378 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
379 goto again; 381 goto again;
380 } 382 }
381 383
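
The added set_current_state(TASK_RUNNING) matters because the worker has already marked itself TASK_INTERRUPTIBLE before re-checking its queues; without flipping back, it could be scheduled away mid-work. A generic sketch of the sleep/recheck idiom (hypothetical helper, not the btrfs worker itself):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static void handle_pending(struct list_head *pending);	/* hypothetical */

static void idle_or_work(spinlock_t *lock, struct list_head *pending)
{
	set_current_state(TASK_INTERRUPTIBLE);	/* announce intent to sleep */
	spin_lock_irq(lock);
	if (!list_empty(pending)) {
		spin_unlock_irq(lock);
		/* back out of the sleep before doing more work */
		set_current_state(TASK_RUNNING);
		handle_pending(pending);
		return;
	}
	spin_unlock_irq(lock);
	schedule();				/* really go to sleep */
}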
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/pagevec.h> 34#include <linux/slab.h>
35#include "compat.h" 35#include "compat.h"
36#include "ctree.h" 36#include "ctree.h"
37#include "disk-io.h" 37#include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
445 unsigned long nr_pages = 0; 445 unsigned long nr_pages = 0;
446 struct extent_map *em; 446 struct extent_map *em;
447 struct address_space *mapping = inode->i_mapping; 447 struct address_space *mapping = inode->i_mapping;
448 struct pagevec pvec;
449 struct extent_map_tree *em_tree; 448 struct extent_map_tree *em_tree;
450 struct extent_io_tree *tree; 449 struct extent_io_tree *tree;
451 u64 end; 450 u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
461 460
462 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 461 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
463 462
464 pagevec_init(&pvec, 0);
465 while (last_offset < compressed_end) { 463 while (last_offset < compressed_end) {
466 page_index = last_offset >> PAGE_CACHE_SHIFT; 464 page_index = last_offset >> PAGE_CACHE_SHIFT;
467 465
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 476 goto next;
479 } 477 }
480 478
481 page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS); 479 page = __page_cache_alloc(mapping_gfp_mask(mapping) &
480 ~__GFP_FS);
482 if (!page) 481 if (!page)
483 break; 482 break;
484 483
485 page->index = page_index; 484 if (add_to_page_cache_lru(page, mapping, page_index,
486 /* 485 GFP_NOFS)) {
487 * what we want to do here is call add_to_page_cache_lru,
488 * but that isn't exported, so we reproduce it here
489 */
490 if (add_to_page_cache(page, mapping,
491 page->index, GFP_NOFS)) {
492 page_cache_release(page); 486 page_cache_release(page);
493 goto next; 487 goto next;
494 } 488 }
495 489
496 /* open coding of lru_cache_add, also not exported */
497 page_cache_get(page);
498 if (!pagevec_add(&pvec, page))
499 __pagevec_lru_add_file(&pvec);
500
501 end = last_offset + PAGE_CACHE_SIZE - 1; 490 end = last_offset + PAGE_CACHE_SIZE - 1;
502 /* 491 /*
503 * at this point, we have a locked page in the page cache 492 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
551next: 540next:
552 last_offset += PAGE_CACHE_SIZE; 541 last_offset += PAGE_CACHE_SIZE;
553 } 542 }
554 if (pagevec_count(&pvec))
555 __pagevec_lru_add_file(&pvec);
556 return 0; 543 return 0;
557} 544}
558 545
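
With add_to_page_cache_lru() available here, the readahead-style insertion above collapses to one call. A minimal sketch of the resulting pattern:

#include <linux/pagemap.h>

static struct page *add_ra_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* mask out __GFP_FS so allocation cannot recurse into the fs */
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		return NULL;

	/* inserts into the page cache and the LRU in one step */
	if (add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);	/* already cached; drop ours */
		return NULL;
	}
	return page;	/* locked page, already on the LRU */
}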
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
279static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
280 struct btrfs_root *root, 281 struct btrfs_root *root,
281 struct extent_buffer *buf, 282 struct extent_buffer *buf,
282 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
283{ 285{
284 u64 refs; 286 u64 refs;
285 u64 owner; 287 u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 BUG_ON(ret); 367 BUG_ON(ret);
366 } 368 }
367 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
368 } 371 }
369 return 0; 372 return 0;
370} 373}
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
391 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
392 struct extent_buffer *cow; 395 struct extent_buffer *cow;
393 int level; 396 int level;
397 int last_ref = 0;
394 int unlock_orig = 0; 398 int unlock_orig = 0;
395 u64 parent_start; 399 u64 parent_start;
396 400
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
441 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
442 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
443 447
444 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
445 452
446 if (buf == root->node) { 453 if (buf == root->node) {
447 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
456 extent_buffer_get(cow); 463 extent_buffer_get(cow);
457 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
458 465
459 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
460 parent_start, root->root_key.objectid, level); 467 last_ref);
461 free_extent_buffer(buf); 468 free_extent_buffer(buf);
462 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
463 } else { 470 } else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
472 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
473 trans->transid); 480 trans->transid);
474 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
475 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
476 parent_start, root->root_key.objectid, level); 483 last_ref);
477 } 484 }
478 if (unlock_orig) 485 if (unlock_orig)
479 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
948 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
949} 956}
950 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
951/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
952 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
953 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1018,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1018 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1019 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1020 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1021 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1022 1049
1023 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1024 root->node = child; 1051 root->node = child;
@@ -1033,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1033 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1034 /* once for the path */ 1061 /* once for the path */
1035 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1036 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1037 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038 /* once for the root ptr */ 1066 /* once for the root ptr */
1039 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1040 return ret; 1068 return 0;
1041 } 1069 }
1042 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1043 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1087,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1087 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1088 ret = wret; 1116 ret = wret;
1089 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1090 u64 bytenr = right->start;
1091 u32 blocksize = right->len;
1092
1093 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1094 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1095 free_extent_buffer(right);
1096 right = NULL;
1097 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1098 1); 1121 1);
1099 if (wret) 1122 if (wret)
1100 ret = wret; 1123 ret = wret;
1101 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1102 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1103 root->root_key.objectid, 1126 free_extent_buffer(right);
1104 level); 1127 right = NULL;
1105 if (wret)
1106 ret = wret;
1107 } else { 1128 } else {
1108 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1109 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1135 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1136 } 1157 }
1137 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1138 /* we've managed to empty the middle node, drop it */
1139 u64 bytenr = mid->start;
1140 u32 blocksize = mid->len;
1141
1142 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1143 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1144 free_extent_buffer(mid);
1145 mid = NULL;
1146 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1147 if (wret) 1162 if (wret)
1148 ret = wret; 1163 ret = wret;
1149 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1150 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1151 if (wret) 1166 free_extent_buffer(mid);
1152 ret = wret; 1167 mid = NULL;
1153 } else { 1168 } else {
1154 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1155 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1589,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1589 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1590 1605
1591 ret = -EAGAIN; 1606 ret = -EAGAIN;
1592 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1593 if (tmp) { 1608 if (tmp) {
1594 /* 1609 /*
1595 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1754,6 @@ again:
1739 p->nodes[level + 1], 1754 p->nodes[level + 1],
1740 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1741 if (err) { 1756 if (err) {
1742 free_extent_buffer(b);
1743 ret = err; 1757 ret = err;
1744 goto done; 1758 goto done;
1745 } 1759 }
@@ -2075,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2075 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2076 return PTR_ERR(c); 2090 return PTR_ERR(c);
2077 2091
2092 root_add_used(root, root->nodesize);
2093
2078 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2079 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2080 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2133,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2133 int nritems; 2149 int nritems;
2134 2150
2135 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2136 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2137 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2138 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2201,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2201 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2202 return PTR_ERR(split); 2219 return PTR_ERR(split);
2203 2220
2221 root_add_used(root, root->nodesize);
2222
2204 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2205 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2206 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2414,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2414 2433
2415 if (left_nritems) 2434 if (left_nritems)
2416 btrfs_mark_buffer_dirty(left); 2435 btrfs_mark_buffer_dirty(left);
2436 else
2437 clean_tree_block(trans, root, left);
2438
2417 btrfs_mark_buffer_dirty(right); 2439 btrfs_mark_buffer_dirty(right);
2418 2440
2419 btrfs_item_key(right, &disk_key, 0); 2441 btrfs_item_key(right, &disk_key, 0);
@@ -2659,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2659 btrfs_mark_buffer_dirty(left); 2681 btrfs_mark_buffer_dirty(left);
2660 if (right_nritems) 2682 if (right_nritems)
2661 btrfs_mark_buffer_dirty(right); 2683 btrfs_mark_buffer_dirty(right);
2684 else
2685 clean_tree_block(trans, root, right);
2662 2686
2663 btrfs_item_key(right, &disk_key, 0); 2687 btrfs_item_key(right, &disk_key, 0);
2664 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2688 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2668 /* then fixup the leaf pointer in the path */ 2692 /* then fixup the leaf pointer in the path */
2669 if (path->slots[0] < push_items) { 2693 if (path->slots[0] < push_items) {
2670 path->slots[0] += old_left_nritems; 2694 path->slots[0] += old_left_nritems;
2671 if (btrfs_header_nritems(path->nodes[0]) == 0)
2672 clean_tree_block(trans, root, path->nodes[0]);
2673 btrfs_tree_unlock(path->nodes[0]); 2695 btrfs_tree_unlock(path->nodes[0]);
2674 free_extent_buffer(path->nodes[0]); 2696 free_extent_buffer(path->nodes[0]);
2675 path->nodes[0] = left; 2697 path->nodes[0] = left;
@@ -2931,10 +2953,10 @@ again:
2931 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2953 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2932 root->root_key.objectid, 2954 root->root_key.objectid,
2933 &disk_key, 0, l->start, 0); 2955 &disk_key, 0, l->start, 0);
2934 if (IS_ERR(right)) { 2956 if (IS_ERR(right))
2935 BUG_ON(1);
2936 return PTR_ERR(right); 2957 return PTR_ERR(right);
2937 } 2958
2959 root_add_used(root, root->leafsize);
2938 2960
2939 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2961 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2940 btrfs_set_header_bytenr(right, right->start); 2962 btrfs_set_header_bytenr(right, right->start);
@@ -3040,6 +3062,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3062 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3041 goto err; 3063 goto err;
3042 3064
3065 /* the leaf has changed, it now has room. return now */
3066 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3067 goto err;
3068
3043 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3069 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0], 3070 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item); 3071 struct btrfs_file_extent_item);
@@ -3049,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3049 3075
3050 btrfs_set_path_blocking(path); 3076 btrfs_set_path_blocking(path);
3051 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3077 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052 BUG_ON(ret); 3078 if (ret)
3079 goto err;
3053 3080
3054 path->keep_locks = 0; 3081 path->keep_locks = 0;
3055 btrfs_unlock_up_safe(path, 1); 3082 btrfs_unlock_up_safe(path, 1);
@@ -3791,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3791 */ 3818 */
3792 btrfs_unlock_up_safe(path, 0); 3819 btrfs_unlock_up_safe(path, 0);
3793 3820
3794 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3821 root_sub_used(root, leaf->len);
3795 0, root->root_key.objectid, 0); 3822
3796 return ret; 3823 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3824 return 0;
3797} 3825}
3798/* 3826/*
3799 * delete the item at the leaf level in path. If that empties 3827 * delete the item at the leaf level in path. If that empties
@@ -3860,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3860 if (leaf == root->node) { 3888 if (leaf == root->node) {
3861 btrfs_set_header_level(leaf, 0); 3889 btrfs_set_header_level(leaf, 0);
3862 } else { 3890 } else {
3891 btrfs_set_path_blocking(path);
3892 clean_tree_block(trans, root, leaf);
3863 ret = btrfs_del_leaf(trans, root, path, leaf); 3893 ret = btrfs_del_leaf(trans, root, path, leaf);
3864 BUG_ON(ret); 3894 BUG_ON(ret);
3865 } 3895 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0af2e3868573..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -33,6 +34,7 @@
33 34
34struct btrfs_trans_handle; 35struct btrfs_trans_handle;
35struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
36extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -662,6 +664,7 @@ struct btrfs_csum_item {
662#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
663#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
664#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
665 668
666struct btrfs_block_group_item { 669struct btrfs_block_group_item {
667 __le64 used; 670 __le64 used;
@@ -673,42 +676,46 @@ struct btrfs_space_info {
673 u64 flags; 676 u64 flags;
674 677
675 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
676 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this doesn't take mirrors into account */
677 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
678 transaction finishes */ 682 transaction finishes */
679 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
680 current allocations */ 684 current allocations */
681 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
682 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
683 u64 bytes_root; /* the number of bytes needed to commit a
684 transaction */
685 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
686 delalloc/allocations */ 688 delalloc/allocations */
687 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
688 delayed allocation */
689 690
690 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
691 chunks for this space */ 692 chunks for this space */
692 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
693 this space */ 694 this space */
694 int force_delalloc; /* make people start doing filemap_flush until
695 we're under a threshold */
696 695
697 struct list_head list; 696 struct list_head list;
698 697
699 /* for controlling how we free up space for allocations */
700 wait_queue_head_t allocate_wait;
701 wait_queue_head_t flush_wait;
702 int allocating_chunk;
703 int flushing;
704
705 /* for block groups in our same type */ 698 /* for block groups in our same type */
706 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
707 spinlock_t lock; 700 spinlock_t lock;
708 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
709 atomic_t caching_threads; 702 atomic_t caching_threads;
710}; 703};
711 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
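
The entry points for this struct are declared further down in this header (btrfs_block_rsv_add() and friends). A sketch of the intended reserve/consume/release flow, with semantics inferred from the field names above rather than stated by the patch:

static int do_metadata_op(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_block_rsv *rsv, u64 num_bytes)
{
	int retries = 0;
	int ret;

	/* grow rsv->reserved by num_bytes, retrying as needed */
	ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
	if (ret)
		return ret;	/* reservation failed, e.g. -ENOSPC */

	/* ... perform the operation, consuming up to num_bytes ... */

	btrfs_block_rsv_release(root, rsv, num_bytes);	/* return the space */
	return 0;
}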
712/* 719/*
713 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
714 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -759,6 +766,7 @@ struct btrfs_block_group_cache {
759 spinlock_t lock; 766 spinlock_t lock;
760 u64 pinned; 767 u64 pinned;
761 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
762 u64 bytes_super; 770 u64 bytes_super;
763 u64 flags; 771 u64 flags;
764 u64 sectorsize; 772 u64 sectorsize;
@@ -824,6 +832,22 @@ struct btrfs_fs_info {
824 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
825 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
826 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
827 u64 generation; 851 u64 generation;
828 u64 last_trans_committed; 852 u64 last_trans_committed;
829 853
@@ -834,7 +858,6 @@ struct btrfs_fs_info {
834 u64 last_trans_log_full_commit; 858 u64 last_trans_log_full_commit;
835 u64 open_ioctl_trans; 859 u64 open_ioctl_trans;
836 unsigned long mount_opt; 860 unsigned long mount_opt;
837 u64 max_extent;
838 u64 max_inline; 861 u64 max_inline;
839 u64 alloc_start; 862 u64 alloc_start;
840 struct btrfs_transaction *running_transaction; 863 struct btrfs_transaction *running_transaction;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2361void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2362 2435
2363/* file.c */ 2436/* file.c */
2364int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2366 int skip_pinned); 2439 int skip_pinned);
2367int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
318} 319}
319 320
320/* 321/*
321 * helper function to lookup reference count and flags of extent.
322 *
323 * the head node for delayed ref is used to store the sum of all the
324 * reference count modifications queued up in the rbtree. the head
325 * node may also store the extent flags to set. This way you can check
326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
328 */
329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
330 struct btrfs_root *root, u64 bytenr,
331 u64 num_bytes, u64 *refs, u64 *flags)
332{
333 struct btrfs_delayed_ref_node *ref;
334 struct btrfs_delayed_ref_head *head;
335 struct btrfs_delayed_ref_root *delayed_refs;
336 struct btrfs_path *path;
337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
339 struct btrfs_key key;
340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
343 int ret;
344
345 path = btrfs_alloc_path();
346 if (!path)
347 return -ENOMEM;
348
349 key.objectid = bytenr;
350 key.type = BTRFS_EXTENT_ITEM_KEY;
351 key.offset = num_bytes;
352 delayed_refs = &trans->transaction->delayed_refs;
353again:
354 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
355 &key, path, 0, 0);
356 if (ret < 0)
357 goto out;
358
359 if (ret == 0) {
360 leaf = path->nodes[0];
361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
362 if (item_size >= sizeof(*ei)) {
363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
381 } else {
382 num_refs = 0;
383 extent_flags = 0;
384 ret = 0;
385 }
386
387 spin_lock(&delayed_refs->lock);
388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
389 if (ref) {
390 head = btrfs_delayed_node_to_head(ref);
391 if (!mutex_trylock(&head->mutex)) {
392 atomic_inc(&ref->refs);
393 spin_unlock(&delayed_refs->lock);
394
395 btrfs_release_path(root->fs_info->extent_root, path);
396
397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
406
407 num_refs += ref->ref_mod;
408 mutex_unlock(&head->mutex);
409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
415out:
416 spin_unlock(&delayed_refs->lock);
417 btrfs_free_path(path);
418 return ret;
419}
420
421/*
422 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
423 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
424 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 45static void free_fs_root(struct btrfs_root *root);
45 46
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
47
48/* 47/*
49 * end_io_wq structs are used to do processing in task context when an IO is 48 * end_io_wq structs are used to do processing in task context when an IO is
50 * complete. This is used during reads to verify checksums, and it is used 49 * complete. This is used during reads to verify checksums, and it is used
@@ -75,6 +74,11 @@ struct async_submit_bio {
75 int rw; 74 int rw;
76 int mirror_num; 75 int mirror_num;
77 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
78 struct btrfs_work work; 82 struct btrfs_work work;
79}; 83};
80 84
@@ -535,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
 	async = container_of(work, struct async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -557,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_done(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -571,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
+			u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -593,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -628,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -639,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-				   int mirror_num, unsigned long bio_flags)
+				   int mirror_num, unsigned long bio_flags,
+				   u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -649,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	int ret;
 
@@ -672,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num, 0,
+				   bio_offset,
				   __btree_submit_bio_start,
				   __btree_submit_bio_done);
 }
@@ -895,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->ref_cows = 0;
 	root->track_dirty = 0;
 	root->in_radix = 0;
-	root->clean_orphans = 0;
+	root->orphan_item_inserted = 0;
+	root->orphan_cleanup_state = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -904,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree = RB_ROOT;
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->accounting_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -969,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	struct extent_buffer *eb;
-	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-	u64 start = 0;
-	u64 end = 0;
-	int ret;
-
-	if (!log_root_tree)
-		return 0;
-
-	while (1) {
-		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-	}
-	eb = fs_info->log_root_tree->node;
-
-	WARN_ON(btrfs_header_level(eb) != 0);
-	WARN_ON(btrfs_header_nritems(eb) != 0);
-
-	ret = btrfs_free_reserved_extent(fs_info->tree_root,
-					 eb->start, eb->len);
-	BUG_ON(ret);
-
-	free_extent_buffer(eb);
-	kfree(fs_info->log_root_tree);
-	fs_info->log_root_tree = NULL;
-	return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
 {
@@ -1192,19 +1172,23 @@ again:
 	if (root)
 		return root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret == 0)
-		ret = -ENOENT;
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto fail;
@@ -1213,10 +1197,9 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
-	if (ret == 0) {
+	if (ret == 0)
 		root->in_radix = 1;
-		root->clean_orphans = 1;
-	}
+
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1374,19 +1357,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-			   atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages = default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn = btrfs_unplug_io_fn;
 	bdi->unplug_io_data = info;
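For context: the hunk above folds a three-step dance (bdi_init, bdi_register with a privately numbered name, bdi_destroy on failure) into one call. bdi_setup_and_register() owns both the initialization and the unique numbering, which is why the file-local btrfs_bdi_num counter could be deleted earlier in this patch. A userspace model of why the counter moves into the helper; the "name-N" output format is an assumption about the helper's naming, and all names are illustrative:

#include <stdio.h>
#include <stdatomic.h>

/* the sequence number lives inside the helper, so no caller needs a
 * private counter of its own */
static atomic_int bdi_seq;

static int bdi_setup_and_register_model(const char *name)
{
	/* init + register in one step; a failure would unwind internally */
	printf("registered %s-%d\n", name, atomic_fetch_add(&bdi_seq, 1));
	return 0;
}

int main(void)
{
	bdi_setup_and_register_model("btrfs");   /* btrfs-0 */
	bdi_setup_and_register_model("btrfs");   /* btrfs-1 */
	return 0;
}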
@@ -1470,10 +1445,6 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1486,11 +1457,9 @@ static int cleaner_kthread(void *arg)
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			smp_mb();
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
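The replacement sleep is the canonical race-free kthread idiom: mark the task TASK_INTERRUPTIBLE first, test kthread_should_stop() second, and only then schedule(), so a kthread_stop() arriving between the test and the sleep still leaves the task runnable instead of blocked forever. A userspace model of the same lost-wakeup argument, using a mutex/condvar pair in place of the task state (a sketch, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool should_stop;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void *cleaner(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	/* re-testing the stop flag under the lock plays the role of
	 * checking kthread_should_stop() after set_current_state():
	 * a stop request cannot slip in between the test and the sleep */
	while (!atomic_load(&should_stop))
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	puts("cleaner: stop observed, exiting");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, cleaner, NULL);
	sleep(1);
	pthread_mutex_lock(&lock);               /* kthread_stop() analogue */
	atomic_store(&should_stop, 1);
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}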
@@ -1502,36 +1471,40 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	u64 transid;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		delay = HZ * 30;
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
+		spin_lock(&root->fs_info->new_trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->new_trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
-		if (now < cur->start_time || now - cur->start_time < 30) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+		if (!cur->blocked &&
+		    (now < cur->start_time || now - cur->start_time < 30)) {
+			spin_unlock(&root->fs_info->new_trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_commit_transaction(trans, root);
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->new_trans_lock);
 
+		trans = btrfs_join_transaction(root, 1);
+		if (transid == trans->transid) {
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1539,10 +1512,10 @@ sleep:
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(delay);
+			if (!kthread_should_stop() &&
+			    !btrfs_transaction_blocked(root->fs_info))
+				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
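The rewritten loop samples the running transaction's transid under new_trans_lock, drops the lock, and only then joins a transaction; it commits only when the handle it got back still belongs to the transaction it sampled, since otherwise another committer already retired it. A compact userspace model of that sample-join-compare shape (names illustrative, not the btrfs API):

#include <stdint.h>
#include <stdio.h>

struct txn { uint64_t transid; };

static struct txn running = { .transid = 42 };

static struct txn *join_transaction(void) { return &running; }

static void kick(uint64_t sampled)
{
	struct txn *t = join_transaction();

	if (t->transid == sampled)
		printf("committing transaction %llu\n",
		       (unsigned long long)t->transid);
	else
		printf("transaction %llu already committed, dropping handle\n",
		       (unsigned long long)sampled);
}

int main(void)
{
	uint64_t sampled = running.transid;  /* read under the lock */

	kick(sampled);                       /* same transid: commit */
	running.transid = 43;                /* someone else committed */
	kick(sampled);                       /* stale: just release */
	return 0;
}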
@@ -1629,12 +1602,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1769,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
			   min_t(u64, fs_devices->num_devices,
			   fs_info->thread_pool_size),
			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
	 * likely that bios will be send down in a sane order to the
@@ -1819,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1922,17 +1897,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
-
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
+
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
@@ -1983,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(ret);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_cleanup_fs_roots(fs_info);
+		BUG_ON(ret);
+
 		ret = btrfs_recover_relocation(tree_root);
 		if (ret < 0) {
 			printk(KERN_WARNING
@@ -2020,7 +2003,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2045,7 +2029,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2410,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2431,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
-
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	fs_info->closing = 2;
 	smp_mb();
 
@@ -2478,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
+			unsigned long bio_flags, u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -34,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve);
+			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-	if (atomic_dec_and_test(&cache->count))
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache);
+	}
 }
 
 /*
@@ -318,7 +316,7 @@ static int caching_kthread(void *data)
 
 	exclude_super_stripes(extent_root, block_group);
 	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_super += block_group->bytes_super;
+	block_group->space_info->bytes_readonly += block_group->bytes_super;
 	spin_unlock(&block_group->space_info->lock);
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -506,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
+	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+		 BTRFS_BLOCK_GROUP_METADATA;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags == flags) {
@@ -609,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+	if (!trans) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+again:
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out_free;
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
+	} else {
+		num_refs = 0;
+		extent_flags = 0;
+		ret = 0;
+	}
+
+	if (!trans)
+		goto out;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(root->fs_info->extent_root, path);
+
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
+
+		num_refs += head->node.ref_mod;
+		mutex_unlock(&head->mutex);
+	}
+	spin_unlock(&delayed_refs->lock);
+out:
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
+out_free:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
  * Back reference rules. Back refs have three main goals:
  *
  * 1) differentiate between all holders of references to an extent so that
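One detail of the function added above deserves emphasis: when mutex_trylock() on the delayed-ref head fails, it pins the head with a reference, drops the spinlock and the search path, waits by doing a blocking lock/unlock on the head's mutex, and restarts the whole lookup at the again: label. A userspace sketch of that trylock-or-wait-and-retry shape (illustrative, not the btrfs locking rules themselves):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

static int lookup(int attempt)
{
	if (pthread_mutex_trylock(&head_mutex) != 0) {
		/* contended: wait for the current holder to finish,
		 * then redo the search from scratch, as "goto again" does */
		pthread_mutex_lock(&head_mutex);
		pthread_mutex_unlock(&head_mutex);
		printf("attempt %d: contended, retrying\n", attempt);
		return lookup(attempt + 1);
	}
	printf("attempt %d: merging queued ref mods into the result\n",
	       attempt);
	pthread_mutex_unlock(&head_mutex);
	return 0;
}

int main(void)
{
	return lookup(1);
}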
@@ -1588,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     DISCARD_FL_BARRIER);
+			     BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
@@ -1890,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
-			int mark_free = 0;
-			struct extent_buffer *must_clean = NULL;
-
-			ret = pin_down_bytes(trans, root, NULL,
-					     node->bytenr, node->num_bytes,
-					     head->is_data, 1, &must_clean);
-			if (ret > 0)
-				mark_free = 1;
-
-			if (must_clean) {
-				clean_tree_block(NULL, root, must_clean);
-				btrfs_tree_unlock(must_clean);
-				free_extent_buffer(must_clean);
-			}
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
 			if (head->is_data) {
 				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
				BUG_ON(ret);
 			}
-			if (mark_free) {
-				ret = btrfs_free_reserved_extent(root,
-						node->bytenr,
-						node->num_bytes);
-				BUG_ON(ret);
-			}
 		}
 		mutex_unlock(&head->mutex);
 		return 0;
@@ -2346,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 	ret = 0;
 out:
 	btrfs_free_path(path);
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		WARN_ON(ret > 0);
 	return ret;
 }
 
@@ -2659,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
			     struct btrfs_space_info **space_info)
 {
 	struct btrfs_space_info *found;
+	int i;
+	int factor;
+
+	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+		     BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
 
 	found = __find_space_info(info, flags);
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
+		found->disk_used += bytes_used * factor;
 		found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
@@ -2674,16 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&found->block_groups);
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
-	found->flags = flags;
+	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+				BTRFS_BLOCK_GROUP_SYSTEM |
+				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
+	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
-	found->bytes_delalloc = 0;
+	found->bytes_may_use = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
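The factor introduced above captures on-disk amplification: DUP, RAID1 and RAID10 keep two copies of every byte, so the new disk_used counter advances at twice the rate of bytes_used, while single and RAID0 profiles use factor 1. A standalone restatement with made-up flag values:

#include <stdint.h>
#include <stdio.h>

#define BG_DUP    (1u << 0)   /* illustrative bits, not btrfs's values */
#define BG_RAID1  (1u << 1)
#define BG_RAID10 (1u << 2)

/* mirrored profiles write every byte twice */
static uint64_t disk_used(uint64_t bytes_used, unsigned flags)
{
	int factor = (flags & (BG_DUP | BG_RAID1 | BG_RAID10)) ? 2 : 1;

	return bytes_used * factor;
}

int main(void)
{
	printf("1 GiB on RAID1  -> %llu bytes on disk\n",
	       (unsigned long long)disk_used(1ULL << 30, BG_RAID1));
	printf("1 GiB on single -> %llu bytes on disk\n",
	       (unsigned long long)disk_used(1ULL << 30, 0));
	return 0;
}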
@@ -2708,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-	spin_lock(&cache->space_info->lock);
-	spin_lock(&cache->lock);
-	if (!cache->ro) {
-		cache->space_info->bytes_readonly += cache->key.offset -
-					btrfs_block_group_used(&cache->item);
-		cache->ro = 1;
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&cache->space_info->lock);
-}
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2749,492 +2840,49 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	u64 alloc_profile;
-
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-				info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-				info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-				info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
-
-	return btrfs_reduce_alloc_profile(root, data);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		flags |= root->fs_info->avail_data_alloc_bits &
+			 root->fs_info->data_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		flags |= root->fs_info->avail_system_alloc_bits &
+			 root->fs_info->system_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		flags |= root->fs_info->avail_metadata_alloc_bits &
+			 root->fs_info->metadata_alloc_profile;
+	return btrfs_reduce_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
-	u64 alloc_target;
-
-	alloc_target = btrfs_get_alloc_profile(root, 1);
-	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-						       alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-	u64 num_bytes;
-	int level;
-
-	level = BTRFS_MAX_LEVEL - 2;
-	/*
-	 * NOTE: these calculations are absolutely the worst possible case.
-	 * This assumes that _every_ item we insert will require a new leaf, and
-	 * that the tree has grown to its maximum level size.
-	 */
-
-	/*
-	 * for every item we insert we could insert both an extent item and a
-	 * extent ref item. Then for ever item we insert, we will need to cow
-	 * both the original leaf, plus the leaf to the left and right of it.
-	 *
-	 * Unless we are talking about the extent root, then we just want the
-	 * number of items * 2, since we just need the extent item plus its ref.
-	 */
-	if (root == root->fs_info->extent_root)
-		num_bytes = num_items * 2;
-	else
-		num_bytes = (num_items + (2 * num_items)) * 3;
-
-	/*
-	 * num_bytes is total number of leaves we could need times the leaf
-	 * size, and then for every leaf we could end up cow'ing 2 nodes per
-	 * level, down to the leaf level.
-	 */
-	num_bytes = (num_bytes * root->leafsize) +
-		(num_bytes * (level * 2)) * root->nodesize;
-
-	return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	if (BTRFS_I(inode)->reserved_extents <=
-	    BTRFS_I(inode)->outstanding_extents) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		spin_unlock(&meta_sinfo->lock);
-		return 0;
-	}
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
-	BTRFS_I(inode)->reserved_extents--;
-	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-	if (meta_sinfo->bytes_delalloc < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_delalloc = 0;
-	} else {
-		meta_sinfo->bytes_delalloc -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
-{
-	u64 thresh;
-
-	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use;
+	u64 flags;
 
-	thresh = meta_sinfo->total_bytes - thresh;
-	thresh *= 80;
-	do_div(thresh, 100);
-	if (thresh <= meta_sinfo->bytes_delalloc)
-		meta_sinfo->force_delalloc = 1;
+	if (data)
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 	else
-		meta_sinfo->force_delalloc = 0;
-}
-
-struct async_flush {
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-	struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
-{
-	struct async_flush *async;
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-
-	async = container_of(work, struct async_flush, work);
-	root = async->root;
-	info = async->info;
-
-	btrfs_start_delalloc_inodes(root, 0);
-	wake_up(&info->flush_wait);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-
-	kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-	DEFINE_WAIT(wait);
-	u64 used;
-
-	while (1) {
-		prepare_to_wait(&info->flush_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_lock(&info->lock);
-		if (!info->flushing) {
-			spin_unlock(&info->lock);
-			break;
-		}
-
-		used = info->bytes_used + info->bytes_reserved +
-			info->bytes_pinned + info->bytes_readonly +
-			info->bytes_super + info->bytes_root +
-			info->bytes_may_use + info->bytes_delalloc;
-		if (used < info->total_bytes) {
-			spin_unlock(&info->lock);
-			break;
-		}
-		spin_unlock(&info->lock);
-		schedule();
-	}
-	finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
-			   struct btrfs_space_info *info)
-{
-	struct async_flush *async;
-	bool wait = false;
-
-	spin_lock(&info->lock);
-
-	if (!info->flushing) {
-		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_on_flush(info);
-		return;
-	}
-
-	async = kzalloc(sizeof(*async), GFP_NOFS);
-	if (!async)
-		goto flush;
-
-	async->root = root;
-	async->info = info;
-	async->work.func = flush_delalloc_async;
-
-	btrfs_queue_worker(&root->fs_info->enospc_workers,
-			   &async->work);
-	wait_on_flush(info);
-	return;
-
-flush:
-	btrfs_start_delalloc_inodes(root, 0);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
-				struct btrfs_space_info *info)
-{
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-	struct btrfs_trans_handle *trans;
-	bool wait = false;
-	int ret = 0;
-	u64 min_metadata;
-	u64 free_space;
-
-	free_space = btrfs_super_total_bytes(disk_super);
-	/*
-	 * we allow the metadata to grow to a max of either 10gb or 5% of the
-	 * space in the volume.
-	 */
-	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-			   div64_u64(free_space * 5, 100));
-	if (info->total_bytes >= min_metadata) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (info->full) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (!info->allocating_chunk) {
-		info->force_alloc = 1;
-		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_event(info->allocate_wait,
-			   !info->allocating_chunk);
-		return 1;
-	}
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     4096 + 2 * 1024 * 1024,
-			     info->flags, 0);
-	btrfs_end_transaction(trans, root);
-	if (ret)
-		goto out;
-out:
-	spin_lock(&info->lock);
-	info->allocating_chunk = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->allocate_wait);
-
-	if (ret)
-		return 0;
-	return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int flushed = 0;
-	int force_delalloc;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	force_delalloc = meta_sinfo->force_delalloc;
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!flushed)
-		meta_sinfo->bytes_delalloc += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		flushed++;
-
-		if (flushed == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			flushed++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (flushed == 2) {
-			filemap_flush(inode->i_mapping);
-			goto again;
-		} else if (flushed == 3) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_delalloc -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->outstanding_extents,
-		       BTRFS_I(inode)->reserved_extents);
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
+		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	BTRFS_I(inode)->reserved_extents++;
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	if (!flushed && force_delalloc)
-		filemap_flush(inode->i_mapping);
-
-	return 0;
+	return get_alloc_profile(root, flags);
 }
 
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	if (meta_sinfo->bytes_may_use < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_may_use = 0;
-	} else {
-		meta_sinfo->bytes_may_use -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int retries = 0;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!retries)
-		meta_sinfo->bytes_may_use += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		retries++;
-		if (retries == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			retries++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (retries == 2) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_may_use -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
-
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	return 0;
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       BTRFS_BLOCK_GROUP_DATA);
 }
 
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 used;
 	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3247,10 +2895,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+		data_sinfo->bytes_may_use;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
 		/*
@@ -3264,15 +2913,15 @@ again:
 		spin_unlock(&data_sinfo->lock);
 alloc:
 		alloc_target = btrfs_get_alloc_profile(root, 1);
-		trans = btrfs_start_transaction(root, 1);
-		if (!trans)
-			return -ENOMEM;
+		trans = btrfs_join_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
				     bytes + 2 * 1024 * 1024,
				     alloc_target, 0);
 		btrfs_end_transaction(trans, root);
-		if (ret)
+		if (ret < 0)
 			return ret;
 
 		if (!data_sinfo) {
@@ -3287,25 +2936,26 @@ alloc:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
 			ret = btrfs_commit_transaction(trans, root);
 			if (ret)
 				return ret;
 			goto again;
 		}
 
-		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
-		       ", %llu bytes_used, %llu bytes_reserved, "
-		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
-		       "%llu total\n", (unsigned long long)bytes,
-		       (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+		printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+		       "%llu bytes_reserved, " "%llu bytes_pinned, "
+		       "%llu bytes_readonly, %llu may use %llu total\n",
+		       (unsigned long long)bytes,
		       (unsigned long long)data_sinfo->bytes_used,
		       (unsigned long long)data_sinfo->bytes_reserved,
		       (unsigned long long)data_sinfo->bytes_pinned,
		       (unsigned long long)data_sinfo->bytes_readonly,
		       (unsigned long long)data_sinfo->bytes_may_use,
		       (unsigned long long)data_sinfo->total_bytes);
+#endif
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
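Taken together, the hunks above preserve btrfs_check_data_free_space()'s escalation ladder while moving it onto join_transaction: on shortage it first tries to grow the pool with a new data chunk, then commits the running transaction once to release pinned space, and only after both fail returns -ENOSPC. A stripped-down control-flow model of that ladder (the have_room() stub and its behavior are assumptions for illustration):

#include <stdio.h>

static int have_room(int step) { return step >= 2; } /* space frees up on commit */

static int check_data_free_space(void)
{
	int committed = 0;
	int step = 0;

again:
	if (!have_room(step)) {
		if (step == 0) {       /* alloc: try a new data chunk */
			step++;
			goto again;
		}
		if (!committed) {      /* commit once to reclaim pinned bytes */
			committed = 1;
			step++;
			goto again;
		}
		return -1;             /* -ENOSPC */
	}
	puts("reservation granted");
	return 0;
}

int main(void)
{
	return check_data_free_space() ? 1 : 0;
}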
@@ -3316,12 +2966,13 @@ alloc:
 }
 
 /*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing an delalloc extent from the
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
  */
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_space_info *data_sinfo;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3334,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
 	spin_unlock(&data_sinfo->lock);
 }
 
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes)
-{
-	struct btrfs_space_info *data_sinfo;
-
-	/* get the space info for where this inode will be storing its data */
-	data_sinfo = BTRFS_I(inode)->space_info;
-
-	/* make sure we have enough space to handle the data first */
-	spin_lock(&data_sinfo->lock);
-	data_sinfo->bytes_delalloc += bytes;
-
-	/*
-	 * we are adding a delalloc extent without calling
-	 * btrfs_check_data_free_space first. This happens on a weird
-	 * writepage condition, but shouldn't hurt our accounting
-	 */
-	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
-		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
-		BTRFS_I(inode)->reserved_bytes = 0;
-	} else {
-		data_sinfo->bytes_may_use -= bytes;
-		BTRFS_I(inode)->reserved_bytes -= bytes;
-	}
-
-	spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			       u64 bytes)
-{
-	struct btrfs_space_info *info;
-
-	info = BTRFS_I(inode)->space_info;
-
-	spin_lock(&info->lock);
-	info->bytes_delalloc -= bytes;
-	spin_unlock(&info->lock);
-}
-
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
 	struct list_head *head = &info->space_info;
@@ -3389,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3389 rcu_read_unlock(); 2998 rcu_read_unlock();
3390} 2999}
3391 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3392static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3393 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3394 u64 flags, int force) 3019 u64 flags, int force)
3395{ 3020{
3396 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3398 u64 thresh;
3399 int ret = 0; 3023 int ret = 0;
3400 3024
3401 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3418,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3418 goto out; 3042 goto out;
3419 } 3043 }
3420 3044
3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3422 thresh = div_factor(thresh, 8);
3423 if (!force &&
3424 (space_info->bytes_used + space_info->bytes_pinned +
3425 space_info->bytes_reserved + alloc_bytes) < thresh) {
3426 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3427 goto out; 3047 goto out;
3428 } 3048 }
@@ -3444,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3444 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3445 if (ret) 3065 if (ret)
3446 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3447 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3448 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3449out: 3071out:
@@ -3451,13 +3073,713 @@ out:
3451 return ret; 3073 return ret;
3452} 3074}
3453 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
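
The reclaim loop polls delalloc flushing with exponential backoff: a pass that flushes nothing doubles the sleep, capped at HZ/10 (about 100ms), and any progress resets it to one tick. The schedule in isolation, with HZ standing in for the kernel tick rate:

#include <stdio.h>

#define HZ 1000                         /* illustrative tick rate */

int main(void)
{
        int pause = 1;

        for (int i = 0; i < 10; i++) {
                int made_progress = (i == 6);   /* pretend pass 6 flushed an inode */

                if (!made_progress) {
                        printf("sleep %d ticks\n", pause);
                        pause <<= 1;            /* back off ... */
                        if (pause > HZ / 10)
                                pause = HZ / 10;        /* ... capped at ~100ms */
                } else {
                        pause = 1;              /* progress: restart the backoff */
                }
        }
        return 0;
}
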
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
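
The low-priority branch is a proportionality test: a reservation at priority p (out of 10) succeeds only while the bytes it would end up holding stay within p/10 of the bytes available to it. Two worked cases in plain C (priority 6 is the default assigned by btrfs_init_block_rsv below):

#include <stdint.h>
#include <stdio.h>

static int may_reserve(uint64_t unused, uint64_t held,
                       uint64_t request, int priority)
{
        if (unused < request)
                return 0;
        if (priority >= 10)
                return 1;               /* high priority always wins */
        return (unused + held) * priority >= (request + held) * 10;
}

int main(void)
{
        printf("%d\n", may_reserve(100, 0, 50, 6));     /* 600 >= 500 -> 1 */
        printf("%d\n", may_reserve(100, 0, 70, 6));     /* 600 <  700 -> 0 */
        return 0;
}
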
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
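
Note that release is driven by size, not by the byte count passed in: shrinking size spills whatever reserved now exceeds the new target, and the spill either tops up a destination rsv or returns to the space_info. A lock-free userspace model of just the spill calculation:

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t size, reserved; };

/* returns the bytes spilled out of 'r' after shrinking it by num_bytes */
static uint64_t release(struct rsv *r, uint64_t num_bytes)
{
        uint64_t excess = 0;

        r->size -= num_bytes;
        if (r->reserved >= r->size) {
                excess = r->reserved - r->size;
                r->reserved = r->size;
        }
        return excess;
}

int main(void)
{
        struct rsv r = { .size = 100, .reserved = 80 };

        printf("%llu\n", (unsigned long long)release(&r, 50)); /* spills 30 */
        return 0;
}
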
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per-tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
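
Concretely, the live formula reserves room for two copies of the checksum items covering every data block, adds 2% of combined data and metadata usage, and caps the result at a third of the metadata currently in use. A worked example, assuming 4KB blocks and the 4-byte crc32c checksum (the final ALIGN() rounding is omitted):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t data_used = 100ULL << 30;      /* 100GB of data */
        uint64_t meta_used = 4ULL << 30;        /* 4GB of metadata */
        int blocksize_bits = 12;                /* 4KB blocks */
        int csum_size = 4;                      /* crc32c */

        uint64_t num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
        num_bytes += (data_used + meta_used) / 50;      /* +2% */
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;              /* cap at 1/3 */

        printf("global rsv target: %llu MB\n",          /* 1365 MB here */
               (unsigned long long)(num_bytes >> 20));
        return 0;
}
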
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
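
This is a worst-case estimate: one item may force a CoW of a full path (a leaf plus a node at every remaining level) in each of up to three trees. With 4KB leaves and nodes and BTRFS_MAX_LEVEL of 8, that comes to 96KB per item:

#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

int main(void)
{
        unsigned long leafsize = 4096, nodesize = 4096; /* illustrative */
        unsigned long per_item =
                (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3;

        printf("%lu KB per item\n", per_item >> 10);    /* 96 KB */
        return 0;
}
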
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting the orphan item, one for updating the inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation; it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
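
So the reservation taken here is the per-item worst case for each newly outstanding extent, plus roughly an eighth of the dirtied bytes for checksums (calc_csum_metadata_size above). A back-of-envelope check using the 96KB per-item figure from the earlier sketch:

#include <stdio.h>

int main(void)
{
        unsigned long num_bytes = 1UL << 20;    /* dirtying 1MB */
        int nr_extents = 1;                     /* one new extent */
        unsigned long per_item = 96UL << 10;    /* see earlier sketch */
        unsigned long to_reserve = nr_extents * per_item + (num_bytes >> 3);

        printf("%lu KB\n", to_reserve >> 10);   /* 96 + 128 = 224 KB */
        return 0;
}
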
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
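
The combined helpers are meant to bracket a buffered write: reserve data and metadata before touching pages, release both if the write fails. A hedged sketch of the expected calling pattern; write_range() and do_write() are hypothetical stand-ins, and only the reserve/release pairing mirrors the code above:

/* hypothetical caller; do_write() stands in for the page-dirtying work */
static int write_range(struct inode *inode, u64 pos, u64 len)
{
        int ret;

        ret = btrfs_delalloc_reserve_space(inode, len);
        if (ret)
                return ret;     /* e.g. -ENOSPC before any pages are dirtied */

        ret = do_write(inode, pos, len);        /* hypothetical */
        if (ret)
                btrfs_delalloc_release_space(inode, len);
        return ret;
}
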
3775
3454static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, 3777 struct btrfs_root *root,
3456 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3457 int mark_free)
3458{ 3779{
3459 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3460 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3461 u64 total = num_bytes; 3783 u64 total = num_bytes;
3462 u64 old_val; 3784 u64 old_val;
3463 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3476,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3476 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3477 if (!cache) 3799 if (!cache)
3478 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3479 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3480 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3481 3809
@@ -3488,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3488 old_val += num_bytes; 3816 old_val += num_bytes;
3489 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3490 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3491 cache->space_info->bytes_used += num_bytes;
3492 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3493 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3494 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3495 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3496 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3497 } else { 3824 } else {
3498 old_val -= num_bytes; 3825 old_val -= num_bytes;
3499 cache->space_info->bytes_used -= num_bytes;
3500 if (cache->ro)
3501 cache->space_info->bytes_readonly += num_bytes;
3502 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3503 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3504 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3505 if (mark_free) {
3506 int ret;
3507
3508 ret = btrfs_discard_extent(root, bytenr,
3509 num_bytes);
3510 WARN_ON(ret);
3511 3833
3512 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3513 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3514 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3515 }
3516 } 3837 }
3517 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3518 total -= num_bytes; 3839 total -= num_bytes;
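
The new disk_used counter tracks raw bytes rather than logical ones, which is why the factor computed at the top of the function doubles the accounting for mirrored profiles (DUP, RAID1, RAID10). In miniature:

#include <stdint.h>
#include <stdio.h>

struct sinfo { uint64_t bytes_used, disk_used; };

static void account_alloc(struct sinfo *s, uint64_t num_bytes, int factor)
{
        s->bytes_used += num_bytes;             /* logical bytes */
        s->disk_used += num_bytes * factor;     /* raw bytes, x2 if mirrored */
}

int main(void)
{
        struct sinfo s = { 0, 0 };

        account_alloc(&s, 1 << 20, 2);          /* 1MB extent, RAID1 group */
        printf("%llu %llu\n",                   /* 1048576 2097152 */
               (unsigned long long)s.bytes_used,
               (unsigned long long)s.disk_used);
        return 0;
}
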
@@ -3536,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3536 return bytenr; 3857 return bytenr;
3537} 3858}
3538 3859
3539/* 3860static int pin_down_extent(struct btrfs_root *root,
3540 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3541 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3542int btrfs_pin_extent(struct btrfs_root *root,
3543 u64 bytenr, u64 num_bytes, int reserved)
3544{ 3863{
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct btrfs_block_group_cache *cache;
3547
3548 cache = btrfs_lookup_block_group(fs_info, bytenr);
3549 BUG_ON(!cache);
3550
3551 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3552 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3553 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3559,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3559 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3560 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3561 3874
3562 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3563 3879
3564 set_extent_dirty(fs_info->pinned_extents, 3880/*
3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3566 return 0; 3894 return 0;
3567} 3895}
3568 3896
3569static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3570 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3571{ 3903{
3572 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3573 spin_lock(&cache->lock); 3905 if (sinfo) {
3574 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3575 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3576 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3577 } else { 3924 } else {
3578 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3579 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3580 } 3935 }
3581 spin_unlock(&cache->lock); 3936 return ret;
3582 spin_unlock(&cache->space_info->lock);
3583 return 0;
3584} 3937}
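
The -EAGAIN return exists because a block group can flip read-only (e.g. during relocation) between the allocator picking it and the reservation landing; the caller is expected to redo the accounting against the space_info and move on. A userspace model of just that failure path:

#include <errno.h>
#include <stdio.h>

struct group { int ro; unsigned long reserved; };

static int reserve(struct group *g, unsigned long bytes)
{
        if (g->ro)
                return -EAGAIN; /* group went read-only under us */
        g->reserved += bytes;
        return 0;
}

int main(void)
{
        struct group g = { .ro = 1 };

        printf("%d\n", reserve(&g, 4096));      /* -EAGAIN (-11 on Linux) */
        return 0;
}
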
3585 3938
3586int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3612 3965
3613 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3614 return 0; 3969 return 0;
3615} 3970}
3616 3971
@@ -3637,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3637 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3638 } 3993 }
3639 3994
3995 start += len;
3996
3640 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3641 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3642 cache->pinned -= len; 3999 cache->pinned -= len;
3643 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3644 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3645 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3646
3647 start += len;
3648 } 4010 }
3649 4011
3650 if (cache) 4012 if (cache)
@@ -3657,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3657{ 4019{
3658 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3660 u64 start; 4024 u64 start;
3661 u64 end; 4025 u64 end;
4026 int idx;
3662 int ret; 4027 int ret;
3663 4028
3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3679 cond_resched(); 4044 cond_resched();
3680 } 4045 }
3681 4046
3682 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3684 4049 &fs_info->durable_block_rsv_list, list) {
3685static int pin_down_bytes(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 u64 bytenr, u64 num_bytes,
3689 int is_data, int reserved,
3690 struct extent_buffer **must_clean)
3691{
3692 int err = 0;
3693 struct extent_buffer *buf;
3694
3695 if (is_data)
3696 goto pinit;
3697
3698 /*
3699 * discard is sloooow, and so triggering discards on
3700 * individual btree blocks isn't a good plan. Just
3701 * pin everything in discard mode.
3702 */
3703 if (btrfs_test_opt(root, DISCARD))
3704 goto pinit;
3705 4050
3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 4051 idx = trans->transid & 0x1;
3707 if (!buf) 4052 if (block_rsv->freed[idx] > 0) {
3708 goto pinit; 4053 block_rsv_add_bytes(block_rsv,
4054 block_rsv->freed[idx], 0);
4055 block_rsv->freed[idx] = 0;
4056 }
4057 if (atomic_read(&block_rsv->usage) == 0) {
4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3709 4059
3710 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3711 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3712 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3713 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3714 */ 4064 }
3715 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3716 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3717 u64 header_owner = btrfs_header_owner(buf);
3718 u64 header_transid = btrfs_header_generation(buf);
3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3720 header_transid == trans->transid &&
3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3722 *must_clean = buf;
3723 return 1;
3724 } 4067 }
3725 btrfs_tree_unlock(buf);
3726 } 4068 }
3727 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3728pinit:
3729 if (path)
3730 btrfs_set_path_blocking(path);
3731 /* unlocks the pinned mutex */
3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3733 4070
3734 BUG_ON(err < 0);
3735 return 0; 4071 return 0;
3736} 4072}
3737 4073
@@ -3892,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3892 BUG_ON(ret); 4228 BUG_ON(ret);
3893 } 4229 }
3894 } else { 4230 } else {
3895 int mark_free = 0;
3896 struct extent_buffer *must_clean = NULL;
3897
3898 if (found_extent) { 4231 if (found_extent) {
3899 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3900 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3907,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3907 } 4240 }
3908 } 4241 }
3909 4242
3910 ret = pin_down_bytes(trans, root, path, bytenr,
3911 num_bytes, is_data, 0, &must_clean);
3912 if (ret > 0)
3913 mark_free = 1;
3914 BUG_ON(ret < 0);
3915 /*
3916 * it is going to be very rare for someone to be waiting
3917 * on the block we're freeing. del_items might need to
3918 * schedule, so rather than get fancy, just force it
3919 * to blocking here
3920 */
3921 if (must_clean)
3922 btrfs_set_lock_blocking(must_clean);
3923
3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3925 num_to_del); 4244 num_to_del);
3926 BUG_ON(ret); 4245 BUG_ON(ret);
3927 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3928 4247
3929 if (must_clean) {
3930 clean_tree_block(NULL, root, must_clean);
3931 btrfs_tree_unlock(must_clean);
3932 free_extent_buffer(must_clean);
3933 }
3934
3935 if (is_data) { 4248 if (is_data) {
3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3937 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3941,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3942 } 4255 }
3943 4256
3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3945 mark_free);
3946 BUG_ON(ret); 4258 BUG_ON(ret);
3947 } 4259 }
3948 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3950,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3950} 4262}
3951 4263
3952/* 4264/*
3953 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3954 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3955 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3956 * removes it from the tree. 4268 * removes it from the tree.
@@ -3962,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3962 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3963 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3964 struct rb_node *node; 4276 struct rb_node *node;
3965 int ret; 4277 int ret = 0;
3966 4278
3967 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3968 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4014,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4014 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4015 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4016 4328
4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4018 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4019 head->must_insert_reserved); 4331 ret = 1;
4020 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4021 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4022 return 0; 4335 return ret;
4023out: 4336out:
4024 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4025 return 0; 4338 return 0;
4026} 4339}
4027 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 BUG_ON(block_rsv->space_info != cache->space_info);
4364
4365 if (btrfs_header_generation(buf) == trans->transid) {
4366 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4367 ret = check_ref_cleanup(trans, root, buf->start);
4368 if (!ret)
4369 goto pin;
4370 }
4371
4372 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4373 pin_down_extent(root, cache, buf->start, buf->len, 1);
4374 goto pin;
4375 }
4376
4377 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4378
4379 btrfs_add_free_space(cache, buf->start, buf->len);
4380 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4381 if (ret == -EAGAIN) {
4382 /* block group became read-only */
4383 update_reserved_bytes(cache, buf->len, 0, 1);
4384 goto out;
4385 }
4386
4387 ret = 1;
4388 spin_lock(&block_rsv->lock);
4389 if (block_rsv->reserved < block_rsv->size) {
4390 block_rsv->reserved += buf->len;
4391 ret = 0;
4392 }
4393 spin_unlock(&block_rsv->lock);
4394
4395 if (ret) {
4396 spin_lock(&cache->space_info->lock);
4397 cache->space_info->bytes_reserved -= buf->len;
4398 spin_unlock(&cache->space_info->lock);
4399 }
4400 goto out;
4401 }
4402pin:
4403 if (block_rsv->durable && !cache->ro) {
4404 ret = 0;
4405 spin_lock(&cache->lock);
4406 if (!cache->ro) {
4407 cache->reserved_pinned += buf->len;
4408 ret = 1;
4409 }
4410 spin_unlock(&cache->lock);
4411
4412 if (ret) {
4413 spin_lock(&block_rsv->lock);
4414 block_rsv->freed[trans->transid & 0x1] += buf->len;
4415 spin_unlock(&block_rsv->lock);
4416 }
4417 }
4418out:
4419 btrfs_put_block_group(cache);
4420}
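
The freed[] array indexed by trans->transid & 0x1 double-buffers the bytes freed in each transaction: they are parked under that transaction's parity bit and only folded back into the rsv at commit (see the btrfs_finish_extent_commit hunk above). A plain-C model of the lifecycle:

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t reserved; uint64_t freed[2]; };

static void free_block(struct rsv *r, uint64_t transid, uint64_t len)
{
        r->freed[transid & 0x1] += len;         /* park under this transaction */
}

static void commit(struct rsv *r, uint64_t transid)
{
        int idx = transid & 0x1;

        r->reserved += r->freed[idx];           /* fold back at commit */
        r->freed[idx] = 0;
}

int main(void)
{
        struct rsv r = { 0, { 0, 0 } };

        free_block(&r, 7, 4096);
        commit(&r, 7);
        printf("%llu\n", (unsigned long long)r.reserved);       /* 4096 */
        return 0;
}
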
4421
4028int btrfs_free_extent(struct btrfs_trans_handle *trans, 4422int btrfs_free_extent(struct btrfs_trans_handle *trans,
4029 struct btrfs_root *root, 4423 struct btrfs_root *root,
4030 u64 bytenr, u64 num_bytes, u64 parent, 4424 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4046 parent, root_objectid, (int)owner, 4440 parent, root_objectid, (int)owner,
4047 BTRFS_DROP_DELAYED_REF, NULL); 4441 BTRFS_DROP_DELAYED_REF, NULL);
4048 BUG_ON(ret); 4442 BUG_ON(ret);
4049 ret = check_ref_cleanup(trans, root, bytenr);
4050 BUG_ON(ret);
4051 } else { 4443 } else {
4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4444 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4053 parent, root_objectid, owner, 4445 parent, root_objectid, owner,
@@ -4057,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4057 return ret; 4449 return ret;
4058} 4450}
4059 4451
4060int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4061 struct btrfs_root *root,
4062 u64 bytenr, u32 blocksize,
4063 u64 parent, u64 root_objectid, int level)
4064{
4065 u64 used;
4066 spin_lock(&root->node_lock);
4067 used = btrfs_root_used(&root->root_item) - blocksize;
4068 btrfs_set_root_used(&root->root_item, used);
4069 spin_unlock(&root->node_lock);
4070
4071 return btrfs_free_extent(trans, root, bytenr, blocksize,
4072 parent, root_objectid, level, 0);
4073}
4074
4075static u64 stripe_align(struct btrfs_root *root, u64 val) 4452static u64 stripe_align(struct btrfs_root *root, u64 val)
4076{ 4453{
4077 u64 mask = ((u64)root->stripesize - 1); 4454 u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4124 return 0; 4501 return 0;
4125} 4502}
4126 4503
4504static int get_block_group_index(struct btrfs_block_group_cache *cache)
4505{
4506 int index;
4507 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4508 index = 0;
4509 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4510 index = 1;
4511 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4512 index = 2;
4513 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4514 index = 3;
4515 else
4516 index = 4;
4517 return index;
4518}
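
The index orders the per-RAID block group lists from most to least redundant, so the allocator can fall back bucket by bucket (the ++index retry added to find_free_extent below). A userspace mirror of the mapping; the flag bit positions are restated here for the sketch and should be taken as illustrative:

#include <stdio.h>

#define BG_RAID0  (1UL << 3)    /* bit positions restated for the sketch */
#define BG_RAID1  (1UL << 4)
#define BG_DUP    (1UL << 5)
#define BG_RAID10 (1UL << 6)

static int get_index(unsigned long flags)
{
        if (flags & BG_RAID10)
                return 0;
        if (flags & BG_RAID1)
                return 1;
        if (flags & BG_DUP)
                return 2;
        if (flags & BG_RAID0)
                return 3;
        return 4;               /* single */
}

int main(void)
{
        printf("%d %d\n", get_index(BG_RAID1), get_index(0));   /* 1 4 */
        return 0;
}
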
4519
4127enum btrfs_loop_type { 4520enum btrfs_loop_type {
4128 LOOP_FIND_IDEAL = 0, 4521 LOOP_FIND_IDEAL = 0,
4129 LOOP_CACHING_NOWAIT = 1, 4522 LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4145 u64 num_bytes, u64 empty_size, 4538 u64 num_bytes, u64 empty_size,
4146 u64 search_start, u64 search_end, 4539 u64 search_start, u64 search_end,
4147 u64 hint_byte, struct btrfs_key *ins, 4540 u64 hint_byte, struct btrfs_key *ins,
4148 u64 exclude_start, u64 exclude_nr,
4149 int data) 4541 int data)
4150{ 4542{
4151 int ret = 0; 4543 int ret = 0;
@@ -4158,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4158 struct btrfs_space_info *space_info; 4550 struct btrfs_space_info *space_info;
4159 int last_ptr_loop = 0; 4551 int last_ptr_loop = 0;
4160 int loop = 0; 4552 int loop = 0;
4553 int index = 0;
4161 bool found_uncached_bg = false; 4554 bool found_uncached_bg = false;
4162 bool failed_cluster_refill = false; 4555 bool failed_cluster_refill = false;
4163 bool failed_alloc = false; 4556 bool failed_alloc = false;
@@ -4170,6 +4563,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4170 ins->offset = 0; 4563 ins->offset = 0;
4171 4564
4172 space_info = __find_space_info(root->fs_info, data); 4565 space_info = __find_space_info(root->fs_info, data);
4566 if (!space_info) {
4567 printk(KERN_ERR "No space info for %d\n", data);
4568 return -ENOSPC;
4569 }
4173 4570
4174 if (orig_root->ref_cows || empty_size) 4571 if (orig_root->ref_cows || empty_size)
4175 allowed_chunk_alloc = 1; 4572 allowed_chunk_alloc = 1;
@@ -4223,6 +4620,7 @@ ideal_cache:
4223 btrfs_put_block_group(block_group); 4620 btrfs_put_block_group(block_group);
4224 up_read(&space_info->groups_sem); 4621 up_read(&space_info->groups_sem);
4225 } else { 4622 } else {
4623 index = get_block_group_index(block_group);
4226 goto have_block_group; 4624 goto have_block_group;
4227 } 4625 }
4228 } else if (block_group) { 4626 } else if (block_group) {
@@ -4231,7 +4629,8 @@ ideal_cache:
4231 } 4629 }
4232search: 4630search:
4233 down_read(&space_info->groups_sem); 4631 down_read(&space_info->groups_sem);
4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4632 list_for_each_entry(block_group, &space_info->block_groups[index],
4633 list) {
4235 u64 offset; 4634 u64 offset;
4236 int cached; 4635 int cached;
4237 4636
@@ -4422,23 +4821,22 @@ checks:
4422 goto loop; 4821 goto loop;
4423 } 4822 }
4424 4823
4425 if (exclude_nr > 0 && 4824 ins->objectid = search_start;
4426 (search_start + num_bytes > exclude_start && 4825 ins->offset = num_bytes;
4427 search_start < exclude_start + exclude_nr)) { 4826
4428 search_start = exclude_start + exclude_nr; 4827 if (offset < search_start)
4828 btrfs_add_free_space(block_group, offset,
4829 search_start - offset);
4830 BUG_ON(offset > search_start);
4429 4831
4832 ret = update_reserved_bytes(block_group, num_bytes, 1,
4833 (data & BTRFS_BLOCK_GROUP_DATA));
4834 if (ret == -EAGAIN) {
4430 btrfs_add_free_space(block_group, offset, num_bytes); 4835 btrfs_add_free_space(block_group, offset, num_bytes);
4431 /*
4432 * if search_start is still in this block group
4433 * then we just re-search this block group
4434 */
4435 if (search_start >= block_group->key.objectid &&
4436 search_start < (block_group->key.objectid +
4437 block_group->key.offset))
4438 goto have_block_group;
4439 goto loop; 4836 goto loop;
4440 } 4837 }
4441 4838
4839 /* we are all good, lets return */
4442 ins->objectid = search_start; 4840 ins->objectid = search_start;
4443 ins->offset = num_bytes; 4841 ins->offset = num_bytes;
4444 4842
@@ -4446,18 +4844,18 @@ checks:
4446 btrfs_add_free_space(block_group, offset, 4844 btrfs_add_free_space(block_group, offset,
4447 search_start - offset); 4845 search_start - offset);
4448 BUG_ON(offset > search_start); 4846 BUG_ON(offset > search_start);
4449
4450 update_reserved_extents(block_group, num_bytes, 1);
4451
4452 /* we are all good, lets return */
4453 break; 4847 break;
4454loop: 4848loop:
4455 failed_cluster_refill = false; 4849 failed_cluster_refill = false;
4456 failed_alloc = false; 4850 failed_alloc = false;
4851 BUG_ON(index != get_block_group_index(block_group));
4457 btrfs_put_block_group(block_group); 4852 btrfs_put_block_group(block_group);
4458 } 4853 }
4459 up_read(&space_info->groups_sem); 4854 up_read(&space_info->groups_sem);
4460 4855
4856 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4857 goto search;
4858
4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4859 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4462 * for them to make caching progress. Also 4860 * for them to make caching progress. Also
4463 * determine the best possible bg to cache 4861 * determine the best possible bg to cache
@@ -4471,6 +4869,7 @@ loop:
4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4869 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4472 (found_uncached_bg || empty_size || empty_cluster || 4870 (found_uncached_bg || empty_size || empty_cluster ||
4473 allowed_chunk_alloc)) { 4871 allowed_chunk_alloc)) {
4872 index = 0;
4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4873 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4475 found_uncached_bg = false; 4874 found_uncached_bg = false;
4476 loop++; 4875 loop++;
@@ -4553,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4553 int dump_block_groups) 4952 int dump_block_groups)
4554{ 4953{
4555 struct btrfs_block_group_cache *cache; 4954 struct btrfs_block_group_cache *cache;
4955 int index = 0;
4556 4956
4557 spin_lock(&info->lock); 4957 spin_lock(&info->lock);
4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4958 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4559 (unsigned long long)(info->total_bytes - info->bytes_used - 4959 (unsigned long long)(info->total_bytes - info->bytes_used -
4560 info->bytes_pinned - info->bytes_reserved - 4960 info->bytes_pinned - info->bytes_reserved -
4561 info->bytes_super), 4961 info->bytes_readonly),
4562 (info->full) ? "" : "not "); 4962 (info->full) ? "" : "not ");
4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4963 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4964 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4565 "\n",
4566 (unsigned long long)info->total_bytes, 4965 (unsigned long long)info->total_bytes,
4966 (unsigned long long)info->bytes_used,
4567 (unsigned long long)info->bytes_pinned, 4967 (unsigned long long)info->bytes_pinned,
4568 (unsigned long long)info->bytes_delalloc, 4968 (unsigned long long)info->bytes_reserved,
4569 (unsigned long long)info->bytes_may_use, 4969 (unsigned long long)info->bytes_may_use,
4570 (unsigned long long)info->bytes_used, 4970 (unsigned long long)info->bytes_readonly);
4571 (unsigned long long)info->bytes_root,
4572 (unsigned long long)info->bytes_super,
4573 (unsigned long long)info->bytes_reserved);
4574 spin_unlock(&info->lock); 4971 spin_unlock(&info->lock);
4575 4972
4576 if (!dump_block_groups) 4973 if (!dump_block_groups)
4577 return; 4974 return;
4578 4975
4579 down_read(&info->groups_sem); 4976 down_read(&info->groups_sem);
4580 list_for_each_entry(cache, &info->block_groups, list) { 4977again:
4978 list_for_each_entry(cache, &info->block_groups[index], list) {
4581 spin_lock(&cache->lock); 4979 spin_lock(&cache->lock);
4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4980 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4583 "%llu pinned %llu reserved\n", 4981 "%llu pinned %llu reserved\n",
@@ -4589,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4589 btrfs_dump_free_space(cache, bytes); 4987 btrfs_dump_free_space(cache, bytes);
4590 spin_unlock(&cache->lock); 4988 spin_unlock(&cache->lock);
4591 } 4989 }
4990 if (++index < BTRFS_NR_RAID_TYPES)
4991 goto again;
4592 up_read(&info->groups_sem); 4992 up_read(&info->groups_sem);
4593} 4993}
4594 4994
@@ -4614,9 +5014,8 @@ again:
4614 5014
4615 WARN_ON(num_bytes < root->sectorsize); 5015 WARN_ON(num_bytes < root->sectorsize);
4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 5016 ret = find_free_extent(trans, root, num_bytes, empty_size,
4617 search_start, search_end, hint_byte, ins, 5017 search_start, search_end, hint_byte,
4618 trans->alloc_exclude_start, 5018 ins, data);
4619 trans->alloc_exclude_nr, data);
4620 5019
4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5020 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4622 num_bytes = num_bytes >> 1; 5021 num_bytes = num_bytes >> 1;
@@ -4654,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4654 ret = btrfs_discard_extent(root, start, len); 5053 ret = btrfs_discard_extent(root, start, len);
4655 5054
4656 btrfs_add_free_space(cache, start, len); 5055 btrfs_add_free_space(cache, start, len);
4657 update_reserved_extents(cache, len, 0); 5056 update_reserved_bytes(cache, len, 0, 1);
4658 btrfs_put_block_group(cache); 5057 btrfs_put_block_group(cache);
4659 5058
4660 return ret; 5059 return ret;
@@ -4717,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4717 btrfs_mark_buffer_dirty(path->nodes[0]); 5116 btrfs_mark_buffer_dirty(path->nodes[0]);
4718 btrfs_free_path(path); 5117 btrfs_free_path(path);
4719 5118
4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5119 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4721 1, 0);
4722 if (ret) { 5120 if (ret) {
4723 printk(KERN_ERR "btrfs update block group failed for %llu " 5121 printk(KERN_ERR "btrfs update block group failed for %llu "
4724 "%llu\n", (unsigned long long)ins->objectid, 5122 "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4778 btrfs_mark_buffer_dirty(leaf); 5176 btrfs_mark_buffer_dirty(leaf);
4779 btrfs_free_path(path); 5177 btrfs_free_path(path);
4780 5178
4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5179 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4782 1, 0);
4783 if (ret) { 5180 if (ret) {
4784 printk(KERN_ERR "btrfs update block group failed for %llu " 5181 printk(KERN_ERR "btrfs update block group failed for %llu "
4785 "%llu\n", (unsigned long long)ins->objectid, 5182 "%llu\n", (unsigned long long)ins->objectid,
@@ -4855,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4855 put_caching_control(caching_ctl); 5252 put_caching_control(caching_ctl);
4856 } 5253 }
4857 5254
4858 update_reserved_extents(block_group, ins->offset, 1); 5255 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5256 BUG_ON(ret);
4859 btrfs_put_block_group(block_group); 5257 btrfs_put_block_group(block_group);
4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5258 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4861 0, owner, offset, ins, 1); 5259 0, owner, offset, ins, 1);
4862 return ret; 5260 return ret;
4863} 5261}
4864 5262
4865/*
4866 * finds a free extent and does all the dirty work required for allocation
4867 * returns the key for the extent through ins, and a tree buffer for
4868 * the first block of the extent through buf.
4869 *
4870 * returns 0 if everything worked, non-zero otherwise.
4871 */
4872static int alloc_tree_block(struct btrfs_trans_handle *trans,
4873 struct btrfs_root *root,
4874 u64 num_bytes, u64 parent, u64 root_objectid,
4875 struct btrfs_disk_key *key, int level,
4876 u64 empty_size, u64 hint_byte, u64 search_end,
4877 struct btrfs_key *ins)
4878{
4879 int ret;
4880 u64 flags = 0;
4881
4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4883 empty_size, hint_byte, search_end,
4884 ins, 0);
4885 if (ret)
4886 return ret;
4887
4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4889 if (parent == 0)
4890 parent = ins->objectid;
4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4892 } else
4893 BUG_ON(parent > 0);
4894
4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4896 struct btrfs_delayed_extent_op *extent_op;
4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4898 BUG_ON(!extent_op);
4899 if (key)
4900 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4901 else
4902 memset(&extent_op->key, 0, sizeof(extent_op->key));
4903 extent_op->flags_to_set = flags;
4904 extent_op->update_key = 1;
4905 extent_op->update_flags = 1;
4906 extent_op->is_data = 0;
4907
4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4909 ins->offset, parent, root_objectid,
4910 level, BTRFS_ADD_DELAYED_EXTENT,
4911 extent_op);
4912 BUG_ON(ret);
4913 }
4914
4915 if (root_objectid == root->root_key.objectid) {
4916 u64 used;
4917 spin_lock(&root->node_lock);
4918 used = btrfs_root_used(&root->root_item) + num_bytes;
4919 btrfs_set_root_used(&root->root_item, used);
4920 spin_unlock(&root->node_lock);
4921 }
4922 return ret;
4923}
4924
4925struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5263struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4926 struct btrfs_root *root, 5264 struct btrfs_root *root,
4927 u64 bytenr, u32 blocksize, 5265 u64 bytenr, u32 blocksize,
@@ -4960,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4960 return buf; 5298 return buf;
4961} 5299}
4962 5300
5301static struct btrfs_block_rsv *
5302use_block_rsv(struct btrfs_trans_handle *trans,
5303 struct btrfs_root *root, u32 blocksize)
5304{
5305 struct btrfs_block_rsv *block_rsv;
5306 int ret;
5307
5308 block_rsv = get_block_rsv(trans, root);
5309
5310 if (block_rsv->size == 0) {
5311 ret = reserve_metadata_bytes(block_rsv, blocksize);
5312 if (ret)
5313 return ERR_PTR(ret);
5314 return block_rsv;
5315 }
5316
5317 ret = block_rsv_use_bytes(block_rsv, blocksize);
5318 if (!ret)
5319 return block_rsv;
5320
5321 WARN_ON(1);
5322 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5323 block_rsv->size, block_rsv->reserved,
5324 block_rsv->freed[0], block_rsv->freed[1]);
5325
5326 return ERR_PTR(-ENOSPC);
5327}
5328
5329static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5330{
5331 block_rsv_add_bytes(block_rsv, blocksize, 0);
5332 block_rsv_release_bytes(block_rsv, NULL, 0);
5333}
5334
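
use_block_rsv() above consumes a tree-block allocation out of an already-filled reservation and only falls back to reserving fresh metadata bytes when the rsv is still empty; unuse_block_rsv() hands the bytes back when the allocation later fails. A minimal userspace model of that reserve/consume/return accounting, with a simplified struct standing in for btrfs_block_rsv:

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* toy stand-in for struct btrfs_block_rsv; only the fields the
     * sketch needs, not the kernel layout */
    struct rsv {
            uint64_t size;          /* bytes this rsv is sized for */
            uint64_t reserved;      /* bytes currently backing it */
    };

    /* consume n bytes from the reservation, like block_rsv_use_bytes() */
    static int rsv_use_bytes(struct rsv *r, uint64_t n)
    {
            if (r->reserved < n)
                    return -ENOSPC;
            r->reserved -= n;
            return 0;
    }

    /* give the bytes back on failure, mirroring unuse_block_rsv() */
    static void rsv_unuse_bytes(struct rsv *r, uint64_t n)
    {
            r->reserved += n;
    }

    int main(void)
    {
            struct rsv r = { .size = 16384, .reserved = 8192 };

            if (rsv_use_bytes(&r, 4096) == 0)
                    printf("block came out of the rsv, %llu left\n",
                           (unsigned long long)r.reserved);
            rsv_unuse_bytes(&r, 4096);      /* e.g. the tree insert failed */
            printf("after unuse: %llu reserved\n",
                   (unsigned long long)r.reserved);
            return 0;
    }

The kernel version additionally handles the rsv->size == 0 case by reserving on demand, and dumps the rsv counters before returning -ENOSPC.
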
4963/* 5335/*
4964 * helper function to allocate a block for a given tree 5336 * finds a free extent and does all the dirty work required for allocation
5337 * returns the key for the extent through ins, and a tree buffer for
5338 * the first block of the extent through buf.
5339 *
4965 * returns the tree buffer or NULL. 5340 * returns the tree buffer or NULL.
4966 */ 5341 */
4967struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5342struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4971 u64 hint, u64 empty_size) 5346 u64 hint, u64 empty_size)
4972{ 5347{
4973 struct btrfs_key ins; 5348 struct btrfs_key ins;
4974 int ret; 5349 struct btrfs_block_rsv *block_rsv;
4975 struct extent_buffer *buf; 5350 struct extent_buffer *buf;
5351 u64 flags = 0;
5352 int ret;
4976 5353
4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5354
4978 key, level, empty_size, hint, (u64)-1, &ins); 5355 block_rsv = use_block_rsv(trans, root, blocksize);
5356 if (IS_ERR(block_rsv))
5357 return ERR_CAST(block_rsv);
5358
5359 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5360 empty_size, hint, (u64)-1, &ins, 0);
4979 if (ret) { 5361 if (ret) {
4980 BUG_ON(ret > 0); 5362 unuse_block_rsv(block_rsv, blocksize);
4981 return ERR_PTR(ret); 5363 return ERR_PTR(ret);
4982 } 5364 }
4983 5365
4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5366 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4985 blocksize, level); 5367 blocksize, level);
5368 BUG_ON(IS_ERR(buf));
5369
5370 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5371 if (parent == 0)
5372 parent = ins.objectid;
5373 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5374 } else
5375 BUG_ON(parent > 0);
5376
5377 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5378 struct btrfs_delayed_extent_op *extent_op;
5379 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5380 BUG_ON(!extent_op);
5381 if (key)
5382 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5383 else
5384 memset(&extent_op->key, 0, sizeof(extent_op->key));
5385 extent_op->flags_to_set = flags;
5386 extent_op->update_key = 1;
5387 extent_op->update_flags = 1;
5388 extent_op->is_data = 0;
5389
5390 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5391 ins.offset, parent, root_objectid,
5392 level, BTRFS_ADD_DELAYED_EXTENT,
5393 extent_op);
5394 BUG_ON(ret);
5395 }
4986 return buf; 5396 return buf;
4987} 5397}
4988 5398
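
The removed alloc_tree_block() did not disappear; its body now sits inline in btrfs_alloc_free_block(): reserve the extent, initialize the buffer, then queue a delayed tree ref whose btrfs_delayed_extent_op records the key and flags to apply when the ref is run. A sketch of just the extent_op setup, using a simplified struct rather than the kernel one:

    #include <string.h>
    #include <stdlib.h>

    struct disk_key {
            unsigned long long objectid;
            unsigned char type;
            unsigned long long offset;
    };

    /* simplified mirror of the fields the hunk above fills in */
    struct extent_op {
            struct disk_key key;
            unsigned long long flags_to_set;
            int update_key;
            int update_flags;
            int is_data;
    };

    static struct extent_op *make_tree_extent_op(const struct disk_key *key,
                                                 unsigned long long flags)
    {
            struct extent_op *op = malloc(sizeof(*op));

            if (!op)
                    return NULL;
            if (key)
                    memcpy(&op->key, key, sizeof(op->key));
            else
                    memset(&op->key, 0, sizeof(op->key));
            op->flags_to_set = flags;   /* e.g. FULL_BACKREF for reloc roots */
            op->update_key = 1;         /* apply both key and flags later */
            op->update_flags = 1;
            op->is_data = 0;            /* tree block, not a data extent */
            return op;
    }

    int main(void)
    {
            struct extent_op *op = make_tree_extent_op(NULL, 0);

            free(op);
            return 0;
    }

In the kernel the op is handed to btrfs_add_delayed_tree_ref() so the actual backref insertion is deferred and batched; the sketch stops at building the op.
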
@@ -5205,6 +5615,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5615 next = btrfs_find_tree_block(root, bytenr, blocksize);
5206 if (!next) { 5616 if (!next) {
5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5617 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5618 if (!next)
5619 return -ENOMEM;
5208 reada = 1; 5620 reada = 1;
5209 } 5621 }
5210 btrfs_tree_lock(next); 5622 btrfs_tree_lock(next);
@@ -5305,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305 struct btrfs_path *path, 5717 struct btrfs_path *path,
5306 struct walk_control *wc) 5718 struct walk_control *wc)
5307{ 5719{
5308 int ret = 0; 5720 int ret;
5309 int level = wc->level; 5721 int level = wc->level;
5310 struct extent_buffer *eb = path->nodes[level]; 5722 struct extent_buffer *eb = path->nodes[level];
5311 u64 parent = 0; 5723 u64 parent = 0;
@@ -5383,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383 btrfs_header_owner(path->nodes[level + 1])); 5795 btrfs_header_owner(path->nodes[level + 1]));
5384 } 5796 }
5385 5797
5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5798 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5387 root->root_key.objectid, level, 0);
5388 BUG_ON(ret);
5389out: 5799out:
5390 wc->refs[level] = 0; 5800 wc->refs[level] = 0;
5391 wc->flags[level] = 0; 5801 wc->flags[level] = 0;
5392 return ret; 5802 return 0;
5393} 5803}
5394 5804
5395static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5805static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5417,7 +5827,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5417 if (ret > 0) { 5827 if (ret > 0) {
5418 path->slots[level]++; 5828 path->slots[level]++;
5419 continue; 5829 continue;
5420 } 5830 } else if (ret < 0)
5831 return ret;
5421 level = wc->level; 5832 level = wc->level;
5422 } 5833 }
5423 return 0; 5834 return 0;
@@ -5466,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5466 * also make sure backrefs for the shared block and all lower level 5877 * also make sure backrefs for the shared block and all lower level
5467 * blocks are properly updated. 5878 * blocks are properly updated.
5468 */ 5879 */
5469int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5880int btrfs_drop_snapshot(struct btrfs_root *root,
5881 struct btrfs_block_rsv *block_rsv, int update_ref)
5470{ 5882{
5471 struct btrfs_path *path; 5883 struct btrfs_path *path;
5472 struct btrfs_trans_handle *trans; 5884 struct btrfs_trans_handle *trans;
@@ -5484,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5896 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5485 BUG_ON(!wc); 5897 BUG_ON(!wc);
5486 5898
5487 trans = btrfs_start_transaction(tree_root, 1); 5899 trans = btrfs_start_transaction(tree_root, 0);
5900 if (block_rsv)
5901 trans->block_rsv = block_rsv;
5488 5902
5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5903 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5490 level = btrfs_header_level(root->node); 5904 level = btrfs_header_level(root->node);
@@ -5572,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5572 } 5986 }
5573 5987
5574 BUG_ON(wc->level == 0); 5988 BUG_ON(wc->level == 0);
5575 if (trans->transaction->in_commit || 5989 if (btrfs_should_end_transaction(trans, tree_root)) {
5576 trans->transaction->delayed_refs.flushing) {
5577 ret = btrfs_update_root(trans, tree_root, 5990 ret = btrfs_update_root(trans, tree_root,
5578 &root->root_key, 5991 &root->root_key,
5579 root_item); 5992 root_item);
5580 BUG_ON(ret); 5993 BUG_ON(ret);
5581 5994
5582 btrfs_end_transaction(trans, tree_root); 5995 btrfs_end_transaction_throttle(trans, tree_root);
5583 trans = btrfs_start_transaction(tree_root, 1); 5996 trans = btrfs_start_transaction(tree_root, 0);
5584 } else { 5997 if (block_rsv)
5585 unsigned long update; 5998 trans->block_rsv = block_rsv;
5586 update = trans->delayed_ref_updates;
5587 trans->delayed_ref_updates = 0;
5588 if (update)
5589 btrfs_run_delayed_refs(trans, tree_root,
5590 update);
5591 } 5999 }
5592 } 6000 }
5593 btrfs_release_path(root, path); 6001 btrfs_release_path(root, path);
@@ -5615,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5615 kfree(root); 6023 kfree(root);
5616 } 6024 }
5617out: 6025out:
5618 btrfs_end_transaction(trans, tree_root); 6026 btrfs_end_transaction_throttle(trans, tree_root);
5619 kfree(wc); 6027 kfree(wc);
5620 btrfs_free_path(path); 6028 btrfs_free_path(path);
5621 return err; 6029 return err;
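
btrfs_drop_snapshot() now takes an optional block_rsv and lets btrfs_should_end_transaction() decide when to checkpoint, instead of inspecting in_commit and delayed_refs.flushing directly; each checkpoint saves drop_progress, ends the handle with the throttled variant, and starts a fresh handle with the rsv re-attached. The long-running-work shape of that loop, as a self-contained toy:

    #include <stdio.h>

    struct trans { int age; };              /* toy transaction handle */

    static struct trans *start_trans(void)
    {
            static struct trans t;
            t.age = 0;                      /* fresh handle each time */
            return &t;
    }

    static void end_trans(struct trans *t)  { (void)t; }
    static int should_end(struct trans *t)  { return t->age >= 8; } /* stand-in policy */

    int main(void)
    {
            struct trans *t = start_trans();
            int unit;

            for (unit = 0; unit < 20; unit++) {     /* 20 units of tree walk */
                    t->age++;
                    if (should_end(t)) {
                            printf("checkpoint at unit %d, restarting\n", unit);
                            end_trans(t);           /* throttled end in btrfs */
                            t = start_trans();      /* rsv re-attached here */
                    }
            }
            end_trans(t);
            return 0;
    }

Re-attaching the rsv after every restart mirrors the diff: a fresh handle does not inherit the previous one's block_rsv.
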
@@ -7211,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7211 return flags; 7619 return flags;
7212} 7620}
7213 7621
7214static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7622static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7215 struct btrfs_block_group_cache *shrink_block_group,
7216 int force)
7217{ 7623{
7218 struct btrfs_trans_handle *trans; 7624 struct btrfs_space_info *sinfo = cache->space_info;
7219 u64 new_alloc_flags; 7625 u64 num_bytes;
7220 u64 calc; 7626 int ret = -ENOSPC;
7221 7627
7222 spin_lock(&shrink_block_group->lock); 7628 if (cache->ro)
7223 if (btrfs_block_group_used(&shrink_block_group->item) + 7629 return 0;
7224 shrink_block_group->reserved > 0) {
7225 spin_unlock(&shrink_block_group->lock);
7226 7630
7227 trans = btrfs_start_transaction(root, 1); 7631 spin_lock(&sinfo->lock);
7228 spin_lock(&shrink_block_group->lock); 7632 spin_lock(&cache->lock);
7633 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7634 cache->bytes_super - btrfs_block_group_used(&cache->item);
7635
7636 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7637 sinfo->bytes_may_use + sinfo->bytes_readonly +
7638 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7639 sinfo->bytes_readonly += num_bytes;
7640 sinfo->bytes_reserved += cache->reserved_pinned;
7641 cache->reserved_pinned = 0;
7642 cache->ro = 1;
7643 ret = 0;
7644 }
7645 spin_unlock(&cache->lock);
7646 spin_unlock(&sinfo->lock);
7647 return ret;
7648}
7229 7649
7230 new_alloc_flags = update_block_group_flags(root, 7650int btrfs_set_block_group_ro(struct btrfs_root *root,
7231 shrink_block_group->flags); 7651 struct btrfs_block_group_cache *cache)
7232 if (new_alloc_flags != shrink_block_group->flags) {
7233 calc =
7234 btrfs_block_group_used(&shrink_block_group->item);
7235 } else {
7236 calc = shrink_block_group->key.offset;
7237 }
7238 spin_unlock(&shrink_block_group->lock);
7239 7652
7240 do_chunk_alloc(trans, root->fs_info->extent_root, 7653{
7241 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7654 struct btrfs_trans_handle *trans;
7655 u64 alloc_flags;
7656 int ret;
7242 7657
7243 btrfs_end_transaction(trans, root); 7658 BUG_ON(cache->ro);
7244 } else
7245 spin_unlock(&shrink_block_group->lock);
7246 return 0;
7247}
7248 7659
7660 trans = btrfs_join_transaction(root, 1);
7661 BUG_ON(IS_ERR(trans));
7249 7662
7250int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7663 alloc_flags = update_block_group_flags(root, cache->flags);
7251 struct btrfs_block_group_cache *group) 7664 if (alloc_flags != cache->flags)
7665 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7666
7667 ret = set_block_group_ro(cache);
7668 if (!ret)
7669 goto out;
7670 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7671 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7672 if (ret < 0)
7673 goto out;
7674 ret = set_block_group_ro(cache);
7675out:
7676 btrfs_end_transaction(trans, root);
7677 return ret;
7678}
7252 7679
7680int btrfs_set_block_group_rw(struct btrfs_root *root,
7681 struct btrfs_block_group_cache *cache)
7253{ 7682{
7254 __alloc_chunk_for_shrink(root, group, 1); 7683 struct btrfs_space_info *sinfo = cache->space_info;
7255 set_block_group_readonly(group); 7684 u64 num_bytes;
7685
7686 BUG_ON(!cache->ro);
7687
7688 spin_lock(&sinfo->lock);
7689 spin_lock(&cache->lock);
7690 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7691 cache->bytes_super - btrfs_block_group_used(&cache->item);
7692 sinfo->bytes_readonly -= num_bytes;
7693 cache->ro = 0;
7694 spin_unlock(&cache->lock);
7695 spin_unlock(&sinfo->lock);
7256 return 0; 7696 return 0;
7257} 7697}
7258 7698
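
set_block_group_ro() flips a group read-only only when the rest of the space_info can absorb it: the group's free tail (its length minus reserved, pinned, super and used bytes) is added to the space_info totals, and the group goes ro only if everything still fits under total_bytes; btrfs_set_block_group_rw() undoes exactly that bytes_readonly accounting. The check restated as compilable arithmetic, with illustrative numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* per-group numbers (bytes), picked for illustration */
            uint64_t group_len = 1 << 30, reserved = 4 << 20, pinned = 2 << 20;
            uint64_t bytes_super = 1 << 20, used = 700ULL << 20;
            uint64_t reserved_pinned = 0;

            /* space_info-wide totals */
            uint64_t si_used = 5ULL << 30, si_reserved = 64ULL << 20;
            uint64_t si_pinned = 32ULL << 20, si_may_use = 128ULL << 20;
            uint64_t si_readonly = 0, si_total = 8ULL << 30;

            /* free tail of this group that would become read-only */
            uint64_t num_bytes = group_len - reserved - pinned -
                                 bytes_super - used;

            int can_go_ro = si_used + si_reserved + si_pinned + si_may_use +
                            si_readonly + reserved_pinned + num_bytes <
                            si_total;

            printf("free tail %llu bytes, can set RO: %s\n",
                   (unsigned long long)num_bytes, can_go_ro ? "yes" : "no");
            return 0;
    }
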
@@ -7369,7 +7809,6 @@ static int find_first_block_group(struct btrfs_root *root,
7369 } 7809 }
7370 path->slots[0]++; 7810 path->slots[0]++;
7371 } 7811 }
7372 ret = -ENOENT;
7373out: 7812out:
7374 return ret; 7813 return ret;
7375} 7814}
@@ -7420,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7420 */ 7859 */
7421 synchronize_rcu(); 7860 synchronize_rcu();
7422 7861
7862 release_global_block_rsv(info);
7863
7423 while(!list_empty(&info->space_info)) { 7864 while(!list_empty(&info->space_info)) {
7424 space_info = list_entry(info->space_info.next, 7865 space_info = list_entry(info->space_info.next,
7425 struct btrfs_space_info, 7866 struct btrfs_space_info,
7426 list); 7867 list);
7427 7868 if (space_info->bytes_pinned > 0 ||
7869 space_info->bytes_reserved > 0) {
7870 WARN_ON(1);
7871 dump_space_info(space_info, 0, 0);
7872 }
7428 list_del(&space_info->list); 7873 list_del(&space_info->list);
7429 kfree(space_info); 7874 kfree(space_info);
7430 } 7875 }
7431 return 0; 7876 return 0;
7432} 7877}
7433 7878
7879static void __link_block_group(struct btrfs_space_info *space_info,
7880 struct btrfs_block_group_cache *cache)
7881{
7882 int index = get_block_group_index(cache);
7883
7884 down_write(&space_info->groups_sem);
7885 list_add_tail(&cache->list, &space_info->block_groups[index]);
7886 up_write(&space_info->groups_sem);
7887}
7888
7434int btrfs_read_block_groups(struct btrfs_root *root) 7889int btrfs_read_block_groups(struct btrfs_root *root)
7435{ 7890{
7436 struct btrfs_path *path; 7891 struct btrfs_path *path;
@@ -7452,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7452 7907
7453 while (1) { 7908 while (1) {
7454 ret = find_first_block_group(root, path, &key); 7909 ret = find_first_block_group(root, path, &key);
7455 if (ret > 0) { 7910 if (ret > 0)
7456 ret = 0; 7911 break;
7457 goto error;
7458 }
7459 if (ret != 0) 7912 if (ret != 0)
7460 goto error; 7913 goto error;
7461 7914
@@ -7464,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7464 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7917 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7465 if (!cache) { 7918 if (!cache) {
7466 ret = -ENOMEM; 7919 ret = -ENOMEM;
7467 break; 7920 goto error;
7468 } 7921 }
7469 7922
7470 atomic_set(&cache->count, 1); 7923 atomic_set(&cache->count, 1);
@@ -7521,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7521 BUG_ON(ret); 7974 BUG_ON(ret);
7522 cache->space_info = space_info; 7975 cache->space_info = space_info;
7523 spin_lock(&cache->space_info->lock); 7976 spin_lock(&cache->space_info->lock);
7524 cache->space_info->bytes_super += cache->bytes_super; 7977 cache->space_info->bytes_readonly += cache->bytes_super;
7525 spin_unlock(&cache->space_info->lock); 7978 spin_unlock(&cache->space_info->lock);
7526 7979
7527 down_write(&space_info->groups_sem); 7980 __link_block_group(space_info, cache);
7528 list_add_tail(&cache->list, &space_info->block_groups);
7529 up_write(&space_info->groups_sem);
7530 7981
7531 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7982 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7532 BUG_ON(ret); 7983 BUG_ON(ret);
7533 7984
7534 set_avail_alloc_bits(root->fs_info, cache->flags); 7985 set_avail_alloc_bits(root->fs_info, cache->flags);
7535 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7986 if (btrfs_chunk_readonly(root, cache->key.objectid))
7536 set_block_group_readonly(cache); 7987 set_block_group_ro(cache);
7537 } 7988 }
7989
7990 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7991 if (!(get_alloc_profile(root, space_info->flags) &
7992 (BTRFS_BLOCK_GROUP_RAID10 |
7993 BTRFS_BLOCK_GROUP_RAID1 |
7994 BTRFS_BLOCK_GROUP_DUP)))
7995 continue;
7996 /*
7997 * avoid allocating from un-mirrored block group if there are
7998 * mirrored block groups.
7999 */
8000 list_for_each_entry(cache, &space_info->block_groups[3], list)
8001 set_block_group_ro(cache);
8002 list_for_each_entry(cache, &space_info->block_groups[4], list)
8003 set_block_group_ro(cache);
8004 }
8005
8006 init_global_block_rsv(info);
7538 ret = 0; 8007 ret = 0;
7539error: 8008error:
7540 btrfs_free_path(path); 8009 btrfs_free_path(path);
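
block_groups is now an array of per-profile lists, filled through __link_block_group()/get_block_group_index(), and after loading all groups the code write-protects the lists at indexes 3 and 4 whenever the space_info also carries a mirrored profile (RAID10, RAID1 or DUP). Assuming the ordering raid10, raid1, dup, raid0, single for this series, those two indexes are the un-mirrored profiles; a sketch of that assumed mapping, with illustrative flag bits rather than the kernel values:

    #include <stdio.h>

    #define BG_RAID0  (1 << 3)      /* illustrative bits, not the real ones */
    #define BG_RAID1  (1 << 4)
    #define BG_DUP    (1 << 5)
    #define BG_RAID10 (1 << 6)

    /* assumed to match get_block_group_index(): mirrored profiles first */
    static int bg_index(unsigned flags)
    {
            if (flags & BG_RAID10) return 0;
            if (flags & BG_RAID1)  return 1;
            if (flags & BG_DUP)    return 2;
            if (flags & BG_RAID0)  return 3;
            return 4;               /* single */
    }

    int main(void)
    {
            printf("raid0 -> list %d, single -> list %d\n",
                   bg_index(BG_RAID0), bg_index(0));
            return 0;
    }
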
@@ -7595,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7595 BUG_ON(ret); 8064 BUG_ON(ret);
7596 8065
7597 spin_lock(&cache->space_info->lock); 8066 spin_lock(&cache->space_info->lock);
7598 cache->space_info->bytes_super += cache->bytes_super; 8067 cache->space_info->bytes_readonly += cache->bytes_super;
7599 spin_unlock(&cache->space_info->lock); 8068 spin_unlock(&cache->space_info->lock);
7600 8069
7601 down_write(&cache->space_info->groups_sem); 8070 __link_block_group(cache->space_info, cache);
7602 list_add_tail(&cache->list, &cache->space_info->block_groups);
7603 up_write(&cache->space_info->groups_sem);
7604 8071
7605 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8072 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7606 BUG_ON(ret); 8073 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
136 return state; 135 return state;
137} 136}
138 137
139static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
140{ 139{
141 if (!state) 140 if (!state)
142 return; 141 return;
@@ -336,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
336} 335}
337 336
338static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
339 struct extent_state *state, 338 struct extent_state *state, int *bits)
340 unsigned long bits)
341{ 339{
342 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
343 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
344 state->start, state->end, 342 state, bits);
345 state->state, bits);
346 } 343 }
347 344
348 return 0; 345 return 0;
349} 346}
350 347
351static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
352 struct extent_state *state, 349 struct extent_state *state, int *bits)
353 unsigned long bits)
354{ 350{
355 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
368 */ 364 */
369static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
370 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
371 int bits) 367 int *bits)
372{ 368{
373 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
374 int ret; 371 int ret;
375 372
376 if (end < start) { 373 if (end < start) {
@@ -385,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
385 if (ret) 382 if (ret)
386 return ret; 383 return ret;
387 384
388 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
390 state->state |= bits; 387 state->state |= bits_to_set;
391 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
392 if (node) { 389 if (node) {
393 struct extent_state *found; 390 struct extent_state *found;
@@ -457,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
457 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
458 */ 455 */
459static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
461 int delete) 458 int *bits, int wake)
462{ 459{
463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
464 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
465 462
466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
467 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
468 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
469 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -472,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
472 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
473 if (wake) 470 if (wake)
474 wake_up(&state->wq); 471 wake_up(&state->wq);
475 if (delete || state->state == 0) { 472 if (state->state == 0) {
476 if (state->tree) { 473 if (state->tree) {
477 clear_state_cb(tree, state, state->state);
478 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
479 state->tree = NULL; 475 state->tree = NULL;
480 free_extent_state(state); 476 free_extent_state(state);
@@ -515,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
515 int set = 0; 511 int set = 0;
516 int clear = 0; 512 int clear = 0;
517 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
519 clear = 1; 519 clear = 1;
520again: 520again:
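
The separate delete argument to clear_state_bit() is gone because delete can be encoded in the mask itself: `bits |= ~EXTENT_CTLBITS` asks for every real bit to be cleared, while EXTENT_DO_ACCOUNTING and the new EXTENT_FIRST_DELALLOC are control bits that set_state_bits()/clear_state_bit() strip before touching state->state (they only steer the hooks). A small demonstration of the masking, using the flag values from the extent_io.h hunk below:

    #include <stdio.h>

    #define EXTENT_DO_ACCOUNTING  (1 << 11)
    #define EXTENT_FIRST_DELALLOC (1 << 12)
    #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

    int main(void)
    {
            int bits = 0;

            bits |= ~EXTENT_CTLBITS;        /* the "delete" case */
            bits |= EXTENT_FIRST_DELALLOC;  /* always passed to the hooks */

            /* what actually hits state->state */
            int bits_to_clear = bits & ~EXTENT_CTLBITS;

            printf("cleared mask: %#x (ctl bits filtered out)\n",
                   (unsigned)bits_to_clear);
            return 0;
    }
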
@@ -581,8 +581,7 @@ hit_next:
581 if (err) 581 if (err)
582 goto out; 582 goto out;
583 if (state->end <= end) { 583 if (state->end <= end) {
584 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
585 delete);
586 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
587 goto out; 586 goto out;
588 start = last_end + 1; 587 start = last_end + 1;
@@ -603,7 +602,7 @@ hit_next:
603 if (wake) 602 if (wake)
604 wake_up(&state->wq); 603 wake_up(&state->wq);
605 604
606 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
607 606
608 prealloc = NULL; 607 prealloc = NULL;
609 goto out; 608 goto out;
@@ -614,7 +613,7 @@ hit_next:
614 else 613 else
615 next_node = NULL; 614 next_node = NULL;
616 615
617 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
618 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
619 goto out; 618 goto out;
620 start = last_end + 1; 619 start = last_end + 1;
@@ -707,19 +706,19 @@ out:
707 706
708static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
709 struct extent_state *state, 708 struct extent_state *state,
710 int bits) 709 int *bits)
711{ 710{
712 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
713 713
714 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
715 if (ret) 715 if (ret)
716 return ret; 716 return ret;
717 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
719 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
720 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
721 } 720 }
722 state->state |= bits; 721 state->state |= bits_to_set;
723 722
724 return 0; 723 return 0;
725} 724}
@@ -746,10 +745,9 @@ static void cache_state(struct extent_state *state,
746 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
747 */ 746 */
748 747
749static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
750 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
751 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
752 gfp_t mask)
753{ 751{
754 struct extent_state *state; 752 struct extent_state *state;
755 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -758,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
758 u64 last_start; 756 u64 last_start;
759 u64 last_end; 757 u64 last_end;
760 758
759 bits |= EXTENT_FIRST_DELALLOC;
761again: 760again:
762 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
763 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -779,7 +778,7 @@ again:
779 */ 778 */
780 node = tree_search(tree, start); 779 node = tree_search(tree, start);
781 if (!node) { 780 if (!node) {
782 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
783 prealloc = NULL; 782 prealloc = NULL;
784 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
785 goto out; 784 goto out;
@@ -803,7 +802,7 @@ hit_next:
803 goto out; 802 goto out;
804 } 803 }
805 804
806 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
807 if (err) 806 if (err)
808 goto out; 807 goto out;
809 808
@@ -853,7 +852,7 @@ hit_next:
853 if (err) 852 if (err)
854 goto out; 853 goto out;
855 if (state->end <= end) { 854 if (state->end <= end) {
856 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
857 if (err) 856 if (err)
858 goto out; 857 goto out;
859 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -878,7 +877,7 @@ hit_next:
878 else 877 else
879 this_end = last_start - 1; 878 this_end = last_start - 1;
880 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
881 bits); 880 &bits);
882 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
883 if (err) { 882 if (err) {
884 prealloc = NULL; 883 prealloc = NULL;
@@ -904,7 +903,7 @@ hit_next:
904 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
905 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
906 905
907 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
908 if (err) { 907 if (err) {
909 prealloc = NULL; 908 prealloc = NULL;
910 goto out; 909 goto out;
@@ -967,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
967{ 966{
968 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
969 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
970 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
971 NULL, mask);
972} 970}
973 971
974int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1436,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1436 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1437 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1438 1436
1439 if (op & EXTENT_CLEAR_ACCOUNTING)
1440 clear_bits |= EXTENT_DO_ACCOUNTING;
1441
1442 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1443 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1917,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1917 1912
1918 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1919 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1920 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1921 else 1916 else
1922 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1923 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2021,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2021 sector_t sector; 2016 sector_t sector;
2022 struct extent_map *em; 2017 struct extent_map *em;
2023 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2024 int ret; 2020 int ret;
2025 int nr = 0; 2021 int nr = 0;
2026 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2032,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2032 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2033 2029
2034 end = page_end; 2030 end = page_end;
2035 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2036 2040
2037 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2038 char *userpage; 2042 char *userpage;
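
The plain lock_extent() at the top of __extent_read_full_page() turns into a lock, probe for a pending ordered extent, unlock, wait, retry loop, so a readpage never proceeds while ordered I/O is still in flight over the range. The shape of that retry loop as a self-contained sketch, with stubs where the kernel takes the extent lock and waits:

    #include <stdio.h>

    static int pending_ordered = 2;     /* pretend two ordered extents drain */

    static void lock_range(void)    { /* lock_extent() */ }
    static void unlock_range(void)  { /* unlock_extent() */ }
    static int  lookup_ordered(void) { return pending_ordered; }
    static void wait_ordered(void)   { pending_ordered--; } /* blocks in btrfs */

    int main(void)
    {
            for (;;) {
                    lock_range();
                    if (!lookup_ordered())
                            break;      /* lock held, range is quiescent */
                    unlock_range();
                    wait_ordered();     /* btrfs_start_ordered_extent(..., 1) */
                    printf("retrying after ordered extent completed\n");
            }
            /* ... read the page under the lock ... */
            unlock_range();
            return 0;
    }
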
@@ -2679,33 +2683,20 @@ int extent_readpages(struct extent_io_tree *tree,
2679{ 2683{
2680 struct bio *bio = NULL; 2684 struct bio *bio = NULL;
2681 unsigned page_idx; 2685 unsigned page_idx;
2682 struct pagevec pvec;
2683 unsigned long bio_flags = 0; 2686 unsigned long bio_flags = 0;
2684 2687
2685 pagevec_init(&pvec, 0);
2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2688 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2687 struct page *page = list_entry(pages->prev, struct page, lru); 2689 struct page *page = list_entry(pages->prev, struct page, lru);
2688 2690
2689 prefetchw(&page->flags); 2691 prefetchw(&page->flags);
2690 list_del(&page->lru); 2692 list_del(&page->lru);
2691 /* 2693 if (!add_to_page_cache_lru(page, mapping,
2692 * what we want to do here is call add_to_page_cache_lru,
2693 * but that isn't exported, so we reproduce it here
2694 */
2695 if (!add_to_page_cache(page, mapping,
2696 page->index, GFP_KERNEL)) { 2694 page->index, GFP_KERNEL)) {
2697
2698 /* open coding of lru_cache_add, also not exported */
2699 page_cache_get(page);
2700 if (!pagevec_add(&pvec, page))
2701 __pagevec_lru_add_file(&pvec);
2702 __extent_read_full_page(tree, page, get_extent, 2695 __extent_read_full_page(tree, page, get_extent,
2703 &bio, 0, &bio_flags); 2696 &bio, 0, &bio_flags);
2704 } 2697 }
2705 page_cache_release(page); 2698 page_cache_release(page);
2706 } 2699 }
2707 if (pagevec_count(&pvec))
2708 __pagevec_lru_add_file(&pvec);
2709 BUG_ON(!list_empty(pages)); 2700 BUG_ON(!list_empty(pages));
2710 if (bio) 2701 if (bio)
2711 submit_one_bio(READ, bio, 0, bio_flags); 2702 submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148} 149}
149 150
150 151
151int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
152 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
153{ 155{
154 u32 sum; 156 u32 sum;
155 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
156 int bio_index = 0; 158 int bio_index = 0;
157 u64 offset; 159 u64 offset = 0;
158 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
159 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
160 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
173 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
174 176
175 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
176 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
177 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
178 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
179 if (ret == 0) 184 if (ret == 0)
180 goto found; 185 goto found;
@@ -237,6 +242,7 @@ found:
237 else 242 else
238 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
239 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
240 bio_index++; 246 bio_index++;
241 bvec++; 247 bvec++;
242 } 248 }
@@ -244,6 +250,18 @@ found:
244 return 0; 250 return 0;
245} 251}
246 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
247int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
248 struct list_head *list) 266 struct list_head *list)
249{ 267{
@@ -656,6 +674,9 @@ again:
656 goto found; 674 goto found;
657 } 675 }
658 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
659 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
660 u32 item_size; 681 u32 item_size;
661 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee3323c7fc1c..787b50a16a14 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -45,32 +46,42 @@
45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
46 int write_bytes, 47 int write_bytes,
47 struct page **prepared_pages, 48 struct page **prepared_pages,
48 const char __user *buf) 49 struct iov_iter *i)
49{ 50{
50 long page_fault = 0; 51 size_t copied;
51 int i; 52 int pg = 0;
52 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
53 54
54 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
55 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
56 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
57 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
58 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
59 62
60 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
61 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
62 page_fault = __copy_from_user(page_address(page) + offset, 65
63 buf, count);
64 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
65 flush_dcache_page(page); 67 flush_dcache_page(page);
66 kunmap(page); 68 iov_iter_advance(i, copied);
67 buf += count; 69 write_bytes -= copied;
68 write_bytes -= count;
69 70
70 if (page_fault) 71 if (unlikely(copied == 0)) {
71 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
72 } 83 }
73 return page_fault ? -EFAULT : 0; 84 return 0;
74} 85}
75 86
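
btrfs_copy_from_user() now drives an iov_iter: fault the source pages in, copy what it can, advance the iterator by the bytes actually copied, and stay on the same destination page at an adjusted offset after a short copy (on a zero-byte copy the kernel retries with just the current segment's count). A userspace model of the short-copy bookkeeping, with a fake copier that comes up short once:

    #include <stdio.h>
    #include <stddef.h>

    /* fake copy that makes partial progress once, like a faulting copy */
    static size_t copy_some(size_t want, int *hiccup)
    {
            if (*hiccup) {
                    *hiccup = 0;
                    return want / 2;    /* short copy */
            }
            return want;
    }

    int main(void)
    {
            size_t write_bytes = 10000, offset = 300, page_size = 4096;
            size_t count, copied;
            int pg = 0, hiccup = 1;

            while (write_bytes > 0) {
                    count = page_size - offset;
                    if (count > write_bytes)
                            count = write_bytes;

                    copied = copy_some(count, &hiccup);
                    write_bytes -= copied;

                    if (copied < page_size - offset) {
                            offset += copied;   /* resume mid-page */
                    } else {
                            pg++;               /* page filled, move on */
                            offset = 0;
                    }
                    printf("copied %zu, now page %d offset %zu\n",
                           copied, pg, offset);
            }
            return 0;
    }
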
76/* 87/*
@@ -125,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
125 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
127 NULL); 138 NULL);
128 if (err) 139 BUG_ON(err);
129 return err;
130 140
131 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
132 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -141,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
141 * at this time. 151 * at this time.
142 */ 152 */
143 } 153 }
144 return err; 154 return 0;
145} 155}
146 156
147/* 157/*
@@ -822,45 +832,46 @@ again:
822 return 0; 832 return 0;
823} 833}
824 834
825static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
826 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
827{ 838{
828 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
829 loff_t start_pos; 846 loff_t start_pos;
830 ssize_t num_written = 0; 847 ssize_t num_written = 0;
831 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
832 int ret = 0; 851 int ret = 0;
833 struct inode *inode = fdentry(file)->d_inode;
834 struct btrfs_root *root = BTRFS_I(inode)->root;
835 struct page **pages = NULL;
836 int nrptrs; 852 int nrptrs;
837 struct page *pinned[2];
838 unsigned long first_index; 853 unsigned long first_index;
839 unsigned long last_index; 854 unsigned long last_index;
840 int will_write; 855 int will_write;
856 int buffered = 0;
841 857
842 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
844 860
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
846 PAGE_CACHE_SIZE / (sizeof(struct page *)));
847 pinned[0] = NULL; 861 pinned[0] = NULL;
848 pinned[1] = NULL; 862 pinned[1] = NULL;
849 863
850 pos = *ppos;
851 start_pos = pos; 864 start_pos = pos;
852 865
853 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
854 867
855 /* do the reserve before the mutex lock in case we have to do some
856 * flushing. We wouldn't deadlock, but this is more polite.
857 */
858 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
859 if (err)
860 goto out_nolock;
861
862 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
863 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
864 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
865 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
866 if (err) 877 if (err)
@@ -874,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
874 goto out; 885 goto out;
875 886
876 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
877 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
878 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
879 929
880 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
881 start_pos = pos; 931 start_pos = pos;
882 932
883 BTRFS_I(inode)->sequence++;
884 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
885 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
886 935
887 /* 936 /*
888 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -899,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
899 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
900 } 949 }
901 } 950 }
902 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
903 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
904 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
905 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -910,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
910 } 959 }
911 } 960 }
912 961
913 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
914 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
915 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
916 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
917 offset); 966 offset);
918 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
919 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -921,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
921 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
922 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
923 972
924 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
925 if (ret) 974 if (ret)
926 goto out; 975 goto out;
927 976
@@ -929,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
929 pos, first_index, last_index, 978 pos, first_index, last_index,
930 write_bytes); 979 write_bytes);
931 if (ret) { 980 if (ret) {
932 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
933 write_bytes);
934 goto out; 982 goto out;
935 } 983 }
936 984
937 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
938 write_bytes, pages, buf); 986 write_bytes, pages, &i);
939 if (ret) { 987 if (ret == 0) {
940 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
941 write_bytes); 989 num_pages, pos, write_bytes);
942 btrfs_drop_pages(pages, num_pages);
943 goto out;
944 } 990 }
945 991
946 ret = dirty_and_release_pages(NULL, root, file, pages,
947 num_pages, pos, write_bytes);
948 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
949 if (ret) { 993 if (ret) {
950 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
951 write_bytes);
952 goto out; 995 goto out;
953 } 996 }
954 997
@@ -964,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
964 btrfs_throttle(root); 1007 btrfs_throttle(root);
965 } 1008 }
966 1009
967 buf += write_bytes;
968 count -= write_bytes;
969 pos += write_bytes; 1010 pos += write_bytes;
970 num_written += write_bytes; 1011 num_written += write_bytes;
971 1012
@@ -975,9 +1016,7 @@ out:
975 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
976 if (ret) 1017 if (ret)
977 err = ret; 1018 err = ret;
978 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
979 1019
980out_nolock:
981 kfree(pages); 1020 kfree(pages);
982 if (pinned[0]) 1021 if (pinned[0])
983 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1007,7 +1046,7 @@ out_nolock:
1007 num_written = err; 1046 num_written = err;
1008 1047
1009 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1010 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1011 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1012 file->f_dentry); 1051 file->f_dentry);
1013 if (ret == 0) { 1052 if (ret == 0) {
@@ -1022,7 +1061,7 @@ out_nolock:
1022 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1023 } 1062 }
1024 } 1063 }
1025 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1026 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1027 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1028 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
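
In the new btrfs_file_aio_write(), O_DIRECT goes through generic_file_direct_write() first; only when the DIO comes up short does the function fall back to buffered writes for the remainder and set `buffered`, which is why the invalidate_mapping_pages() call above now runs only for that mixed case (a pure DIO never dirtied the page cache). The control flow condensed to its branches, with a stub DIO that writes all but 512 bytes:

    #include <stdio.h>

    static long direct_write(long count)
    {
            return count - 512;         /* pretend the DIO came up short */
    }

    int main(void)
    {
            long count = 8192, written, buffered = 0;

            written = direct_write(count);
            if (written < 0) {
                    return 1;                   /* error from the DIO path */
            } else if (written < count) {
                    buffered = 1;               /* finish the tail buffered */
                    written += count - written; /* buffered write of tail */
            }
            if (buffered)
                    printf("invalidate cached pages over the buffered tail\n");
            printf("wrote %ld bytes\n", written);
            return 0;
    }
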
@@ -1062,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1062 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1063 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1064 */ 1103 */
1065int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1066{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1067 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1068 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1069 int ret = 0; 1109 int ret = 0;
@@ -1103,9 +1143,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1103 if (file && file->private_data) 1143 if (file && file->private_data)
1104 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1105 1145
1106 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1107 if (!trans) { 1147 if (IS_ERR(trans)) {
1108 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1109 goto out; 1149 goto out;
1110 } 1150 }
1111 1151
@@ -1160,7 +1200,7 @@ const struct file_operations btrfs_file_operations = {
1160 .read = do_sync_read, 1200 .read = do_sync_read,
1161 .aio_read = generic_file_aio_read, 1201 .aio_read = generic_file_aio_read,
1162 .splice_read = generic_file_splice_read, 1202 .splice_read = generic_file_splice_read,
1163 .write = btrfs_file_write, 1203 .aio_write = btrfs_file_aio_write,
1164 .mmap = btrfs_file_mmap, 1204 .mmap = btrfs_file_mmap,
1165 .open = generic_file_open, 1205 .open = generic_file_open,
1166 .release = btrfs_release_file, 1206 .release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
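
btrfs_lookup_inode_ref() uses a three-way return convention: an ERR_PTR-encoded errno when the tree search itself fails, NULL when the search succeeds but no matching back reference exists, and a pointer into the leaf on success, so callers must test IS_ERR() before testing for NULL. A userspace model of consuming that convention, with ERR_PTR/IS_ERR defined locally since they are kernel macros:

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* local stand-ins for the kernel's ERR_PTR machinery */
    #define ERR_PTR(err)  ((void *)(intptr_t)(err))
    #define PTR_ERR(p)    ((int)(intptr_t)(p))
    #define IS_ERR(p)     ((uintptr_t)(p) >= (uintptr_t)-4095)

    static int the_ref = 42;

    static int *lookup_ref(int which)
    {
            if (which < 0)
                    return ERR_PTR(-EIO);   /* the search itself failed */
            if (which == 0)
                    return NULL;            /* searched fine, no such ref */
            return &the_ref;                /* found */
    }

    int main(void)
    {
            int *ref = lookup_ref(1);

            if (IS_ERR(ref))
                    printf("error %d\n", PTR_ERR(ref));
            else if (!ref)
                    printf("no ref\n");
            else
                    printf("ref = %d\n", *ref);
            return 0;
    }
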
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len, compressed_size, 252 inline_len, compressed_size,
252 compressed_pages); 253 compressed_pages);
253 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
254 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
255 return 0; 257 return 0;
256} 258}
@@ -413,6 +415,7 @@ again:
413 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
414 BUG_ON(!trans); 416 BUG_ON(!trans);
415 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
416 419
417 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
418 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -438,7 +441,6 @@ again:
438 start, end, NULL, 441 start, end, NULL,
439 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
440 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
441 EXTENT_CLEAR_ACCOUNTING |
442 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
443 445
444 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -696,6 +698,38 @@ retry:
696 return 0; 698 return 0;
697} 699}
698 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
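get_extent_allocation_hint() factors the hint logic out of cow_file_range() so the direct-IO path added further down can reuse it: prefer the block number of the mapping that covers the start offset; if that is one of the sentinel values encoded above EXTENT_MAP_LAST_BYTE (hole, inline, delalloc), fall back to the inode's first real mapping; otherwise report no hint. A self-contained model of just the decision, assuming the kernel's (u64)-4 sentinel boundary:

    #include <stdint.h>
    #include <stdio.h>

    #define EXTENT_MAP_LAST_BYTE ((uint64_t)-4) /* sentinels live above this */

    /* covering: block_start of the mapping over `start`; first: block_start
     * of the inode's first mapping; 0 means no usable hint */
    static uint64_t pick_hint(uint64_t covering, uint64_t first)
    {
        if (covering < EXTENT_MAP_LAST_BYTE)
            return covering;    /* real block number: best hint */
        if (first < EXTENT_MAP_LAST_BYTE)
            return first;       /* covering mapping was bogus, use first */
        return 0;               /* both bogus: just don't worry about it */
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)pick_hint(40960, 4096));
        printf("%llu\n",
               (unsigned long long)pick_hint(EXTENT_MAP_LAST_BYTE, 4096));
        return 0;
    }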
699/* 733/*
700 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
701 * the callbacks end up in this code. The basic idea is to 735 * the callbacks end up in this code. The basic idea is to
@@ -733,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
733 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
734 BUG_ON(!trans); 768 BUG_ON(!trans);
735 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
736 771
737 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
738 773
@@ -752,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
752 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
753 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
754 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
755 EXTENT_CLEAR_ACCOUNTING |
756 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
757 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
758 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -768,35 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
768 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
769 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
770 804
771 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
772 read_lock(&BTRFS_I(inode)->extent_tree.lock);
773 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
774 start, num_bytes);
775 if (em) {
776 /*
777 * if block start isn't an actual block number then find the
778 * first block in this inode and use that as a hint. If that
779 * block is also bogus then just don't worry about it.
780 */
781 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
782 free_extent_map(em);
783 em = search_extent_mapping(em_tree, 0, 0);
784 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
785 alloc_hint = em->block_start;
786 if (em)
787 free_extent_map(em);
788 } else {
789 alloc_hint = em->block_start;
790 free_extent_map(em);
791 }
792 }
793 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
794 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
795 807
796 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
797 unsigned long op; 809 unsigned long op;
798 810
799 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 811 cur_alloc_size = disk_num_bytes;
800 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 812 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
801 root->sectorsize, 0, alloc_hint, 813 root->sectorsize, 0, alloc_hint,
802 (u64)-1, &ins, 1); 814 (u64)-1, &ins, 1);
@@ -1173,6 +1185,13 @@ out_check:
1173 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1174 BUG_ON(ret); 1186 BUG_ON(ret);
1175 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1176 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1177 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1178 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1225,36 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1225} 1244}
1226 1245
1227static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1228 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1229{ 1248{
1230 struct btrfs_root *root = BTRFS_I(inode)->root; 1249 /* not delalloc, ignore it */
1231 u64 size;
1232
1233 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1234 return 0; 1251 return 0;
1235 1252
1236 size = orig->end - orig->start + 1; 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1237 if (size > root->fs_info->max_extent) {
1238 u64 num_extents;
1239 u64 new_size;
1240
1241 new_size = orig->end - split + 1;
1242 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1243 root->fs_info->max_extent);
1244
1245 /*
1246 * if we break a large extent up then leave outstanding_extents
1247 * be, since we've already accounted for the large extent.
1248 */
1249 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1250 root->fs_info->max_extent) < num_extents)
1251 return 0;
1252 }
1253
1254 spin_lock(&BTRFS_I(inode)->accounting_lock);
1255 BTRFS_I(inode)->outstanding_extents++;
1256 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1257
1258 return 0; 1254 return 0;
1259} 1255}
1260 1256
@@ -1268,42 +1264,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1268 struct extent_state *new, 1264 struct extent_state *new,
1269 struct extent_state *other) 1265 struct extent_state *other)
1270{ 1266{
1271 struct btrfs_root *root = BTRFS_I(inode)->root;
1272 u64 new_size, old_size;
1273 u64 num_extents;
1274
1275 /* not delalloc, ignore it */ 1267 /* not delalloc, ignore it */
1276 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1277 return 0; 1269 return 0;
1278 1270
1279 old_size = other->end - other->start + 1; 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1280 if (new->start < other->start)
1281 new_size = other->end - new->start + 1;
1282 else
1283 new_size = new->end - other->start + 1;
1284
1285 /* we're not bigger than the max, unreserve the space and go */
1286 if (new_size <= root->fs_info->max_extent) {
1287 spin_lock(&BTRFS_I(inode)->accounting_lock);
1288 BTRFS_I(inode)->outstanding_extents--;
1289 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1290 return 0;
1291 }
1292
1293 /*
1294 * If we grew by another max_extent, just return, we want to keep that
1295 * reserved amount.
1296 */
1297 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1298 root->fs_info->max_extent);
1299 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1300 root->fs_info->max_extent) > num_extents)
1301 return 0;
1302
1303 spin_lock(&BTRFS_I(inode)->accounting_lock);
1304 BTRFS_I(inode)->outstanding_extents--;
1305 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1306
1307 return 0; 1272 return 0;
1308} 1273}
1309 1274
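With fs_info->max_extent gone, one delalloc extent state accounts for exactly one outstanding extent, so the split and merge hooks reduce to a bare increment and decrement on an atomic counter and accounting_lock is no longer needed here. A compilable analogue of the invariant, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long outstanding_extents;

    /* one delalloc extent state == one outstanding extent */
    static void split_hook(void) { atomic_fetch_add(&outstanding_extents, 1); }
    static void merge_hook(void) { atomic_fetch_sub(&outstanding_extents, 1); }

    int main(void)
    {
        split_hook();   /* [0,8K) splits into [0,4K) + [4K,8K) */
        merge_hook();   /* the two halves merge back into one extent */
        printf("outstanding = %ld\n", atomic_load(&outstanding_extents));
        return 0;
    }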
@@ -1312,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1312 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1313 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1314 */ 1279 */
1315static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1316 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1317{ 1282{
1318 1283
1319 /* 1284 /*
@@ -1321,16 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1321 * but in this case, we are only testing for the DELALLOC 1286 * but in this case, we are only testing for the DELALLOC
1322 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1323 */ 1288 */
1324 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1292
1293 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1295 else
1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1326 1297
1327 spin_lock(&BTRFS_I(inode)->accounting_lock);
1328 BTRFS_I(inode)->outstanding_extents++;
1329 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1330 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1331 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1332 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1333 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1334 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1335 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1336 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1344,44 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1344 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1345 */ 1312 */
1346static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1347 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1348{ 1315{
1349 /* 1316 /*
1350 * set_bit and clear_bit hooks normally require _irqsave/restore 1317 * set_bit and clear_bit hooks normally require _irqsave/restore
1351 * but in this case, we are only testing for the DELALLOC 1318 * but in this case, we are only testing for the DELALLOC
1352 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1353 */ 1320 */
1354 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1355 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1356 1324
1357 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1358 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1359 BTRFS_I(inode)->outstanding_extents--; 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1360 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1361 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1329
1362 } 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1363 1335
1364 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1365 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1366 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1367 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1368 "%llu %llu\n",
1369 (unsigned long long)
1370 state->end - state->start + 1,
1371 (unsigned long long)
1372 root->fs_info->delalloc_bytes);
1373 btrfs_delalloc_free_space(root, inode, (u64)-1);
1374 root->fs_info->delalloc_bytes = 0;
1375 BTRFS_I(inode)->delalloc_bytes = 0;
1376 } else {
1377 btrfs_delalloc_free_space(root, inode,
1378 state->end -
1379 state->start + 1);
1380 root->fs_info->delalloc_bytes -= state->end -
1381 state->start + 1;
1382 BTRFS_I(inode)->delalloc_bytes -= state->end -
1383 state->start + 1;
1384 }
1385 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1386 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1387 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
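EXTENT_FIRST_DELALLOC lets the caller that just reserved space tell the hooks the extent is already counted: the first set over a range strips the bit instead of bumping the counter, and a clear only drops the counter when EXTENT_DO_ACCOUNTING is absent (when it is present, the metadata reservation is released instead). A toy model of that decision; the bit values below are illustrative, not the kernel's:

    #include <stdio.h>

    #define EXTENT_FIRST_DELALLOC (1 << 0)  /* illustrative values only */
    #define EXTENT_DO_ACCOUNTING  (1 << 1)

    static long outstanding;

    static void set_hook(int *bits)
    {
        if (*bits & EXTENT_FIRST_DELALLOC)
            *bits &= ~EXTENT_FIRST_DELALLOC;    /* counted at reserve time */
        else
            outstanding++;                      /* a split made a new extent */
    }

    static void clear_hook(int *bits)
    {
        if (*bits & EXTENT_FIRST_DELALLOC)
            *bits &= ~EXTENT_FIRST_DELALLOC;
        else if (!(*bits & EXTENT_DO_ACCOUNTING))
            outstanding--;      /* the accounting path releases it instead */
    }

    int main(void)
    {
        int bits = EXTENT_FIRST_DELALLOC;
        set_hook(&bits);                /* first set: no increment */
        bits = EXTENT_DO_ACCOUNTING;
        clear_hook(&bits);              /* DO_ACCOUNTING: no decrement */
        printf("outstanding = %ld\n", outstanding);     /* still 0 */
        return 0;
    }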
@@ -1430,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1430 */ 1385 */
1431static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1432 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1433 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1434{ 1390{
1435 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1436 int ret = 0; 1392 int ret = 0;
@@ -1449,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1449 * are inserted into the btree 1405 * are inserted into the btree
1450 */ 1406 */
1451static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1452 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1453{ 1410{
1454 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1455 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1460,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1460 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1461 */ 1418 */
1462static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1463 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1464{ 1422{
1465 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1466 int ret = 0; 1424 int ret = 0;
@@ -1485,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1485 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1486 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1487 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1488 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1489 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1490 } 1449 }
1491 1450
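All three submit hooks grow a u64 bio_offset argument, the logical file offset the bio starts at, so the async checksumming path no longer has to re-derive it from the bio's pages. A minimal sketch of threading such a value through a callback type; the names and signatures are illustrative, not the btrfs ones:

    #include <stdint.h>
    #include <stdio.h>

    typedef int (*submit_hook_t)(void *bio, unsigned long flags,
                                 uint64_t bio_offset);

    static int submit_bio_start(void *bio, unsigned long flags,
                                uint64_t bio_offset)
    {
        (void)bio;
        (void)flags;
        /* checksum work is keyed by the file offset, not recomputed */
        printf("csum work starting at offset %llu\n",
               (unsigned long long)bio_offset);
        return 0;
    }

    int main(void)
    {
        submit_hook_t hook = submit_bio_start;

        return hook(NULL, 0, 4096);
    }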
@@ -1566,6 +1525,7 @@ again:
1566 goto again; 1525 goto again;
1567 } 1526 }
1568 1527
1528 BUG();
1569 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1570 ClearPageChecked(page); 1530 ClearPageChecked(page);
1571out: 1531out:
@@ -1696,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1696static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1697{ 1657{
1698 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1700 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1701 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1702 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1714,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1714 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1715 if (!ret) { 1675 if (!ret) {
1716 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1717 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1718 BUG_ON(ret); 1680 BUG_ON(ret);
1719 btrfs_end_transaction(trans, root);
1720 } 1681 }
1721 goto out; 1682 goto out;
1722 } 1683 }
@@ -1726,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1727 1688
1728 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1729 1692
1730 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1731 compressed = 1; 1694 compressed = 1;
@@ -1757,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1757 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1758 &ordered_extent->list); 1721 &ordered_extent->list);
1759 1722
1760 /* this also removes the ordered extent from the tree */
1761 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1762 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1763 BUG_ON(ret); 1725 BUG_ON(ret);
1764 btrfs_end_transaction(trans, root);
1765out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1766 /* once for us */ 1730 /* once for us */
1767 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1768 /* once for the tree */ 1732 /* once for the tree */
@@ -1884,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1884 1848
1885 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1886 failrec->last_mirror, 1850 failrec->last_mirror,
1887 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1888 return 0; 1852 return 0;
1889} 1853}
1890 1854
@@ -2039,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2039} 2003}
2040 2004
2041/* 2005/*
2006 * calculate extra metadata reservation when snapshotting a subvolume
2007 * that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than they free, so we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
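The freed[] array in the reservation has two buckets indexed by the low bit of the transaction id, so space freed during the current transaction is tracked separately from the previous transaction's, and each bucket is recycled every other transaction. The index computation on its own:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long transid;

        /* the low bit of the transid alternates buckets per transaction */
        for (transid = 100; transid < 104; transid++)
            printf("transid %llu -> freed[%llu]\n",
                   transid, transid & 0x1);
        return 0;
    }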
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called at transaction commit time. If there are no orphan
2095 * files in the subvolume, it removes the orphan item and frees the block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
2042 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
2043 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: the caller of this function should reserve 5 units of metadata
2127 * before calling it.
2044 */ 2128 */
2045int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2046{ 2130{
2047 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2048 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2049 2136
2050 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2051 2141
2052 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2053 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2054 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2055 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2056 } 2148 }
2057 2149
2058 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces a backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 }
2059 2167
2060 spin_unlock(&root->list_lock); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2061 2173
2062 /* 2174 if (block_rsv)
2063 * insert an orphan item to track this unlinked/truncated file 2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2064 */
2065 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2066 2176
2067 return ret; 2177 /* grab metadata reservation from transaction handle */
2178 if (reserve) {
2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2180 BUG_ON(ret);
2181 }
2182
2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to record that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2068} 2196}
2069 2197
2070/* 2198/*
@@ -2074,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2074int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2075{ 2203{
2076 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2077 int ret = 0; 2207 int ret = 0;
2078 2208
2079 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2080 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2081 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2082 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2083 return 0;
2084 } 2213 }
2085 2214
2086 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2087 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2088 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2089 return 0;
2090 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2091 2220
2092 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2093 2225
2094 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2095 2228
2096 return ret; 2229 return 0;
2097} 2230}
2098 2231
2099/* 2232/*
@@ -2110,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2110 struct inode *inode; 2243 struct inode *inode;
2111 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2112 2245
2113 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2114 return; 2247 return;
2115 2248
2116 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
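cmpxchg() returns the previous value, so only the caller that actually swaps orphan_cleanup_state from 0 to ORPHAN_CLEANUP_STARTED sees zero and proceeds; every later caller returns immediately. The same single-runner guard written with portable C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { ORPHAN_CLEANUP_STARTED = 1 };
    static atomic_int cleanup_state;    /* starts at 0 */

    static int try_start_cleanup(void)
    {
        int expected = 0;

        /* succeeds (returns nonzero) only for the first caller */
        return atomic_compare_exchange_strong(&cleanup_state, &expected,
                                              ORPHAN_CLEANUP_STARTED);
    }

    int main(void)
    {
        printf("first caller runs cleanup: %d\n", try_start_cleanup());
        printf("second caller skips:       %d\n", !try_start_cleanup());
        return 0;
    }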
@@ -2163,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2163 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2164 found_key.offset = 0; 2297 found_key.offset = 0;
2165 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2166 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2167 break;
2168 2300
2169 /* 2301 /*
2170 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2171 * the proper thing when we hit it 2303 * the proper thing when we hit it
2172 */ 2304 */
2173 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2174 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2175 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2176 2308
2177 /* 2309 /*
2178 * if this is a bad inode, it means we actually succeeded in 2310 * if this is a bad inode, it means we actually succeeded in
@@ -2181,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2181 * do a destroy_inode 2313 * do a destroy_inode
2182 */ 2314 */
2183 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2184 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2185 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2186 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2187 iput(inode); 2319 iput(inode);
@@ -2199,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2199 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2200 iput(inode); 2332 iput(inode);
2201 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2202 2346
2203 if (nr_unlink) 2347 if (nr_unlink)
2204 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2205 if (nr_truncate) 2349 if (nr_truncate)
2206 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2207
2208 btrfs_free_path(path);
2209} 2351}
2210 2352
2211/* 2353/*
@@ -2524,44 +2666,217 @@ out:
2524 return ret; 2666 return ret;
2525} 2667}
2526 2668
2527static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs: they do not always free space,
2696 * so in the ENOSPC case we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2528{ 2701{
2529 struct btrfs_root *root;
2530 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2531 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2532 int ret; 2711 int ret;
2533 unsigned long nr = 0;
2534 2712
2535 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2536 2716
2537 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2538 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2539 * 1 for orphan 2719
2540 */ 2720 /* check if someone else holds a reference */
2541 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2542 if (ret) 2722 return ERR_PTR(-ENOSPC);
2543 return ret;
2544 2723
2545 trans = btrfs_start_transaction(root, 1); 2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2546 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2547 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2548 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2549 } 2741 }
2550 2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
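__unlink_start_trans() encodes a two-step policy: try a normal transaction with a 10-unit reservation first, and only fall back to the global reservation, serialized by enospc_unlink, after the check_path_shared() walks prove the unlink will really free space rather than COW shared blocks. The shape of that policy as a self-contained sketch; start_trans() and will_free_space() are stand-ins, not btrfs functions:

    #include <errno.h>
    #include <stdio.h>

    static int space_left = 5;

    static int start_trans(int units)
    {
        return units > space_left ? -ENOSPC : 0;
    }

    static int will_free_space(void)
    {
        return 1;   /* stand-in for the check_path_shared() walks */
    }

    static int unlink_start(void)
    {
        int ret = start_trans(10);      /* normal reservation first */

        if (ret != -ENOSPC)
            return ret;
        if (!will_free_space())
            return -ENOSPC;             /* refuse: might not free space */
        return start_trans(0);          /* global reservation fallback */
    }

    int main(void)
    {
        printf("unlink_start() = %d\n", unlink_start());
        return 0;
    }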
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2864
2551 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2552 2866
2553 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2867 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2554 2868
2555 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2556 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2557 2872
2558 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2559 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2560 2877
2561 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2562 2879 __unlink_end_trans(trans, root);
2563 btrfs_end_transaction_throttle(trans, root);
2564 btrfs_unreserve_metadata_space(root, 6);
2565 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2566 return ret; 2881 return ret;
2567} 2882}
@@ -2633,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2633{ 2948{
2634 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2635 int err = 0; 2950 int err = 0;
2636 int ret;
2637 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2638 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2639 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2642,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2642 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2643 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2644 2958
2645 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2646 if (ret) 2960 if (IS_ERR(trans))
2647 return ret;
2648
2649 trans = btrfs_start_transaction(root, 1);
2650 if (IS_ERR(trans)) {
2651 btrfs_unreserve_metadata_space(root, 5);
2652 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2653 }
2654 2962
2655 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2656 2964
@@ -2673,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2673 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2674out: 2982out:
2675 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2676 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2677 btrfs_unreserve_metadata_space(root, 5);
2678 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2679 2986
2680 if (ret && !err)
2681 err = ret;
2682 return err; 2987 return err;
2683} 2988}
2684 2989
@@ -3075,6 +3380,7 @@ out:
3075 if (pending_del_nr) { 3380 if (pending_del_nr) {
3076 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3077 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3078 } 3384 }
3079 btrfs_free_path(path); 3385 btrfs_free_path(path);
3080 return err; 3386 return err;
@@ -3102,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3102 3408
3103 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3104 goto out; 3410 goto out;
3105 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3106 if (ret)
3107 goto out;
3108
3109 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3110 if (ret) 3412 if (ret)
3111 goto out; 3413 goto out;
3112 3414
@@ -3114,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3114again: 3416again:
3115 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3116 if (!page) { 3418 if (!page) {
3117 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3118 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3119 goto out; 3420 goto out;
3120 } 3421 }
3121 3422
@@ -3178,8 +3479,7 @@ again:
3178 3479
3179out_unlock: 3480out_unlock:
3180 if (ret) 3481 if (ret)
3181 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3182 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3183 unlock_page(page); 3483 unlock_page(page);
3184 page_cache_release(page); 3484 page_cache_release(page);
3185out: 3485out:
@@ -3191,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3191 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3192 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3193 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3194 struct extent_map *em; 3494 struct extent_map *em = NULL;
3195 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3196 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3197 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3229,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3229 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3230 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3231 3531
3232 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3233 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3234 break; 3535 break;
3235 3536 }
3236 trans = btrfs_start_transaction(root, 1);
3237 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3238 3538
3239 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3251,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3251 last_byte - 1, 0); 3551 last_byte - 1, 0);
3252 3552
3253 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 2);
3255 } 3554 }
3256 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3257 cur_offset = last_byte; 3557 cur_offset = last_byte;
3258 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3259 break; 3559 break;
3260 } 3560 }
3261 3561
3562 free_extent_map(em);
3262 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3263 GFP_NOFS); 3564 GFP_NOFS);
3264 return err; 3565 return err;
@@ -3285,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3285 } 3586 }
3286 } 3587 }
3287 3588
3288 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3289 if (ret) 3590 if (IS_ERR(trans))
3290 return ret; 3591 return PTR_ERR(trans);
3291 3592
3292 trans = btrfs_start_transaction(root, 1);
3293 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3294 3594
3295 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3297,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3297 3597
3298 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3299 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3300 btrfs_unreserve_metadata_space(root, 1);
3301 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3302 3601
3303 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3310,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3310 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3311 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3312 3611
3313 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3314 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3315 3617
3316 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3317 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3391,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3391 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3392 3694
3393 while (1) { 3695 while (1) {
3394 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3395 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3396 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3397 3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3398 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3399 break; 3712 break;
3400 3713
@@ -3402,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3402 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3403 trans = NULL; 3716 trans = NULL;
3404 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3405 } 3719 }
3406 3720
3407 if (ret == 0) { 3721 if (ret == 0) {
@@ -3642,40 +3956,10 @@ again:
3642 return 0; 3956 return 0;
3643} 3957}
3644 3958
3645static noinline void init_btrfs_i(struct inode *inode)
3646{
3647 struct btrfs_inode *bi = BTRFS_I(inode);
3648
3649 bi->generation = 0;
3650 bi->sequence = 0;
3651 bi->last_trans = 0;
3652 bi->last_sub_trans = 0;
3653 bi->logged_trans = 0;
3654 bi->delalloc_bytes = 0;
3655 bi->reserved_bytes = 0;
3656 bi->disk_i_size = 0;
3657 bi->flags = 0;
3658 bi->index_cnt = (u64)-1;
3659 bi->last_unlink_trans = 0;
3660 bi->ordered_data_close = 0;
3661 bi->force_compress = 0;
3662 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3663 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3664 inode->i_mapping, GFP_NOFS);
3665 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3666 inode->i_mapping, GFP_NOFS);
3667 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3668 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3669 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3670 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3671 mutex_init(&BTRFS_I(inode)->log_mutex);
3672}
3673
3674static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3675{ 3960{
3676 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3677 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3678 init_btrfs_i(inode);
3679 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3680 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3681 return 0; 3965 return 0;
@@ -3738,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3738 if (!inode) 4022 if (!inode)
3739 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3740 4024
3741 init_btrfs_i(inode);
3742
3743 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3744 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3745 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3996,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3996 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3997 int ret = 0; 4279 int ret = 0;
3998 4280
3999 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
4000 return 0; 4282 return 0;
4001 4283
4002 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4017,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
4017{ 4299{
4018 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
4019 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
4020 4306
4021 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
4022 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
4023 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, let's try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: failed to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
4024 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
4025} 4335}
4026 4336
@@ -4138,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4138 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4139 * number 4449 * number
4140 */ 4450 */
4141 init_btrfs_i(inode);
4142 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4143 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4144 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4167,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4167 if (ret != 0) 4476 if (ret != 0)
4168 goto fail; 4477 goto fail;
4169 4478
4170 inode->i_uid = current_fsuid(); 4479 inode_init_owner(inode, dir, mode);
4171
4172 if (dir && (dir->i_mode & S_ISGID)) {
4173 inode->i_gid = dir->i_gid;
4174 if (S_ISDIR(mode))
4175 mode |= S_ISGID;
4176 } else
4177 inode->i_gid = current_fsgid();
4178
4179 inode->i_mode = mode;
4180 inode->i_ino = objectid; 4480 inode->i_ino = objectid;
4181 inode_set_bytes(inode, 0); 4481 inode_set_bytes(inode, 0);
4182 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4482 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
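The open-coded owner setup is replaced by inode_init_owner(), the generic VFS helper implementing the same rule: the uid comes from the creating task, while a setgid parent directory donates its gid and keeps the setgid bit on new subdirectories. A user-space model of the rule the helper centralizes (toy types, not the VFS ones):

    #include <stdio.h>

    #define S_ISGID 02000
    #define S_IFDIR 040000

    struct toy_inode { int uid, gid, mode; };

    static void init_owner(struct toy_inode *inode, const struct toy_inode *dir,
                           int mode, int fsuid, int fsgid)
    {
        inode->uid = fsuid;
        if (dir && (dir->mode & S_ISGID)) {
            inode->gid = dir->gid;              /* inherit the group */
            if ((mode & S_IFDIR) == S_IFDIR)
                mode |= S_ISGID;                /* subdirs stay setgid */
        } else {
            inode->gid = fsgid;
        }
        inode->mode = mode;
    }

    int main(void)
    {
        struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID };
        struct toy_inode child;

        init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("gid=%d setgid=%d\n", child.gid, !!(child.mode & S_ISGID));
        return 0;
    }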
@@ -4302,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4302 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4303 return -EINVAL; 4603 return -EINVAL;
4304 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4305 /* 4609 /*
4306 * 2 for inode item and ref 4610 * 2 for inode item and ref
4307 * 2 for dir items 4611 * 2 for dir items
4308 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4309 */ 4613 */
4310 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4311 if (err) 4615 if (IS_ERR(trans))
4312 return err; 4616 return PTR_ERR(trans);
4313 4617
4314 trans = btrfs_start_transaction(root, 1);
4315 if (!trans)
4316 goto fail;
4317 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4318 4619
4319 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4320 if (err) {
4321 err = -ENOSPC;
4322 goto out_unlock;
4323 }
4324
4325 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4326 dentry->d_name.len, 4621 dentry->d_name.len,
4327 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4350,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4350out_unlock: 4645out_unlock:
4351 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4352 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4353fail: 4648 btrfs_btree_balance_dirty(root, nr);
4354 btrfs_unreserve_metadata_space(root, 5);
4355 if (drop_inode) { 4649 if (drop_inode) {
4356 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4357 iput(inode); 4651 iput(inode);
4358 } 4652 }
4359 btrfs_btree_balance_dirty(root, nr);
4360 return err; 4653 return err;
4361} 4654}
4362 4655
@@ -4366,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4366 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4367 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4368 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4369 int err;
4370 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4371 unsigned long nr = 0; 4664 unsigned long nr = 0;
4372 u64 objectid; 4665 u64 objectid;
4373 u64 index = 0; 4666 u64 index = 0;
4374 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4375 /* 4671 /*
4376 * 2 for inode item and ref 4672 * 2 for inode item and ref
4377 * 2 for dir items 4673 * 2 for dir items
4378 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4379 */ 4675 */
4380 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4381 if (err) 4677 if (IS_ERR(trans))
4382 return err; 4678 return PTR_ERR(trans);
4383 4679
4384 trans = btrfs_start_transaction(root, 1);
4385 if (!trans)
4386 goto fail;
4387 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4388 4681
4389 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4390 if (err) {
4391 err = -ENOSPC;
4392 goto out_unlock;
4393 }
4394
4395 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4396 dentry->d_name.len, 4683 dentry->d_name.len,
4397 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4423,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4423out_unlock: 4710out_unlock:
4424 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4425 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4426fail:
4427 btrfs_unreserve_metadata_space(root, 5);
4428 if (drop_inode) { 4713 if (drop_inode) {
4429 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4430 iput(inode); 4715 iput(inode);
@@ -4451,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4451 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4452 return -EPERM; 4737 return -EPERM;
4453 4738
4454 /*
4455 * 1 item for inode ref
4456 * 2 items for dir items
4457 */
4458 err = btrfs_reserve_metadata_space(root, 3);
4459 if (err)
4460 return err;
4461
4462 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4463 4740
4464 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4465 if (err) 4742 if (err)
4466 goto fail; 4743 goto fail;
4467 4744
4468 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4469 4754
4470 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4471 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4484,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4484 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4485 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4486fail: 4771fail:
4487 btrfs_unreserve_metadata_space(root, 3);
4488 if (drop_inode) { 4772 if (drop_inode) {
4489 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4490 iput(inode); 4774 iput(inode);
@@ -4504,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_fail;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4565,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4825,6 +5098,7 @@ again:
 		}
 		flush_dcache_page(page);
 	} else if (create && PageUptodate(page)) {
+		WARN_ON(1);
 		if (!trans) {
 			kunmap(page);
 			free_extent_map(em);
@@ -4921,11 +5195,651 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+				      struct inode *inode, u64 offset, u64 len)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_bytenr;
+	u64 backref_offset;
+	u64 extent_end;
+	u64 num_bytes;
+	int slot;
+	int found_type;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       offset, 0);
+	if (ret < 0)
+		goto out;
+
+	slot = path->slots[0];
+	if (ret == 1) {
+		if (slot == 0) {
+			/* can't find the item, must cow */
+			ret = 0;
+			goto out;
+		}
+		slot--;
+	}
+	ret = 0;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != inode->i_ino ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* not our file or wrong item type, must cow */
+		goto out;
+	}
+
+	if (key.offset > offset) {
+		/* Wrong offset, must cow */
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(leaf, fi);
+	if (found_type != BTRFS_FILE_EXTENT_REG &&
+	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+		/* not a regular extent, must cow */
+		goto out;
+	}
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end < offset + len) {
+		/* extent doesn't include our full range, must cow */
+		goto out;
+	}
+
+	if (btrfs_extent_readonly(root, disk_bytenr))
+		goto out;
+
+	/*
+	 * look for other files referencing this extent, if we
+	 * find any we must cow
+	 */
+	if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+				  key.offset - backref_offset, disk_bytenr))
+		goto out;
+
+	/*
+	 * adjust disk_bytenr and num_bytes to cover just the bytes
+	 * in this extent we are about to write.  If there
+	 * are any csums in that range we have to cow in order
+	 * to keep the csums correct
+	 */
+	disk_bytenr += backref_offset;
+	disk_bytenr += offset - key.offset;
+	num_bytes = min(offset + len, extent_end) - offset;
+	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+		goto out;
+	/*
+	 * all of the above have passed, it is safe to overwrite this extent
+	 * without cow
+	 */
+	ret = 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+	struct btrfs_trans_handle *trans;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (!create) {
+		len = em->len - (start - em->start);
+		goto map;
+	}
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		int type;
+		int ret;
+		u64 block_start;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+
+		/*
+		 * we're not going to log anything, but we do need
+		 * to make sure the current transaction stays open
+		 * while we look for nocow cross refs
+		 */
+		trans = btrfs_join_transaction(root, 0);
+		if (!trans)
+			goto must_cow;
+
+		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			ret = btrfs_add_ordered_extent_dio(inode, start,
+					   block_start, len, len, type);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				free_extent_map(em);
+				return ret;
+			}
+			goto unlock;
+		}
+		btrfs_end_transaction(trans, root);
+	}
+must_cow:
+	/*
+	 * this will cow the extent, reset the len in case we changed
+	 * it above
+	 */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+	len = min(len, em->len - (start - em->start));
+unlock:
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+			 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+			 0, NULL, GFP_NOFS);
+map:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				       " %llu csum %u private %u\n",
+				       inode->i_ino, (unsigned long long)start,
+				       csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+					      struct bio *bio, int mirror_num,
+					      unsigned long bio_flags, u64 offset)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+	BUG_ON(ret);
+	return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+	dip->disk_bytenr = (u64)bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum) {
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, 0, 0,
+				   dip->logical_offset,
+				   __btrfs_submit_bio_start_direct_io,
+				   __btrfs_submit_bio_done);
+		if (ret)
+			goto out_err;
+		return;
+	} else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 1);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode,
+						      dip->logical_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	int seg;
+	size_t size;
+	unsigned long addr;
+	unsigned blocksize_mask = root->sectorsize - 1;
+	ssize_t retval = -EINVAL;
+	loff_t end = offset;
+
+	if (offset & blocksize_mask)
+		goto out;
+
+	/* Check the memory alignment.  Blocks cannot straddle pages */
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = (unsigned long)iov[seg].iov_base;
+		size = iov[seg].iov_len;
+		end += size;
+		if ((addr & blocksize_mask) || (size & blocksize_mask))
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart, lockend;
+	ssize_t ret;
+	int writing = rw & WRITE;
+	int write_bits = 0;
+	size_t count = iov_length(iov, nr_segs);
+
+	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+			    offset, nr_segs)) {
+		return 0;
+	}
+
+	lockstart = offset;
+	lockend = offset + count - 1;
+
+	if (writing) {
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	}
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state, GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	/*
+	 * we don't use btrfs_set_extent_delalloc because we don't want
+	 * the dirty or uptodate bits
+	 */
+	if (writing) {
+		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     EXTENT_DELALLOC, 0, NULL, &cached_state,
+				     GFP_NOFS);
+		if (ret) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, EXTENT_LOCKED | write_bits,
+					 1, 0, &cached_state, GFP_NOFS);
+			goto out;
+		}
+	}
+
+	free_extent_state(cached_state);
+	cached_state = NULL;
+
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+		   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we didn't
+		 * do IO on.
+		 */
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	}
+out:
+	free_extent_state(cached_state);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
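
Taken together, the direct I/O additions above form one pipeline: check_direct_IO() rejects requests that are not sector aligned, btrfs_direct_IO() locks the extent range and flushes any ordered extents inside it, and __blockdev_direct_IO() then calls back into btrfs_get_blocks_direct() per block range and btrfs_submit_direct() per bio. Note that when the alignment check fails, btrfs_direct_IO() returns 0 rather than an error, which is what makes the generic code fall back to buffered I/O. A condensed sketch of the alignment invariant being enforced (assuming sectorsize is a power of two):

	unsigned blocksize_mask = root->sectorsize - 1;

	/* the file offset and every iovec base/length must be
	 * multiples of the sector size */
	if (offset & blocksize_mask)
		return -EINVAL;
	for (seg = 0; seg < nr_segs; seg++) {
		unsigned long addr = (unsigned long)iov[seg].iov_base;
		if ((addr | iov[seg].iov_len) & blocksize_mask)
			return -EINVAL;
	}
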
@@ -5089,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5098,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5114,7 +6021,6 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -5155,7 +6061,6 @@ again:
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5182,10 +6087,10 @@ again:
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
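
The page_mkwrite hunks above collapse the old split between data and metadata reservations into a single pair of calls, which also simplifies the error paths: one release undoes one reserve. The pairing, sketched with the names from this diff:

	/* reserve data and delalloc metadata for one dirtied page */
	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret)
		return ret;	/* nothing to unwind */

	/* ... dirty the page; on any failure, hand the space back ... */
	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
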
@@ -5210,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5234,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5245,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
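
The truncate loop above is restructured around the orphan block reserve: each pass runs inside a zero-item transaction that borrows root->orphan_block_rsv, and btrfs_block_rsv_check() verifies that roughly five items' worth of space remains. When it returns -EAGAIN the loop commits to refill the reserve and starts a fresh handle. A stripped-down sketch of that retry shape (error handling reduced to the BUG_ONs the diff itself uses):

	while (1) {
		if (!trans) {
			trans = btrfs_start_transaction(root, 0);
			trans->block_rsv = root->orphan_block_rsv;
		}
		if (btrfs_block_rsv_check(trans, root,
					  root->orphan_block_rsv, 0, 5)) {
			/* reserve ran low: commit to refill, then retry */
			btrfs_commit_transaction(trans, root);
			trans = NULL;
			continue;
		}
		/* ... drop a batch of items, end the handle, loop ... */
	}
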
@@ -5309,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
-	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-	ei->root = NULL;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
 	spin_lock_init(&ei->accounting_lock);
+	atomic_set(&ei->outstanding_extents, 0);
+	ei->reserved_extents = 0;
+
+	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -5333,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5353,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5384,7 +6334,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5481,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items.  So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -5506,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5606,7 +6552,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5658,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+	struct btrfs_inode *binode;
+	struct inode *inode = NULL;
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	while (!list_empty(&root->fs_info->delalloc_inodes)) {
+		binode = list_entry(root->fs_info->delalloc_inodes.next,
+				    struct btrfs_inode, delalloc_inodes);
+		inode = igrab(&binode->vfs_inode);
+		if (inode) {
+			list_move_tail(&binode->delalloc_inodes,
+				       &root->fs_info->delalloc_inodes);
+			break;
+		}
+
+		list_del_init(&binode->delalloc_inodes);
+		cond_resched_lock(&root->fs_info->delalloc_lock);
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	if (inode) {
+		write_inode_now(inode, 0);
+		if (delay_iput)
+			btrfs_add_delayed_iput(inode);
+		else
+			iput(inode);
+		return 1;
+	}
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
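
btrfs_start_one_delalloc_inode() above uses the usual safe list walk: take delalloc_lock, pin the inode with igrab() so it cannot be evicted once the lock is dropped, rotate it to the tail so repeated calls make progress, and only then issue the writeback outside the lock. A hypothetical caller that drains the whole list one inode at a time could look like:

	/* flush delalloc inodes while there is work to do;
	 * the return value is 1 for each inode flushed */
	while (btrfs_start_one_delalloc_inode(root, 0))
		cond_resched();
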
@@ -5681,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5772,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5782,36 +6751,28 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
 		}
 
-		ret = btrfs_reserve_metadata_space(root, 3);
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
+			btrfs_end_transaction(trans, root);
+			break;
 		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5825,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    (actual_len > inode->i_size) &&
 		    (cur_offset > inode->i_size)) {
-
 			if (cur_offset > actual_len)
-				i_size = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5885,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -5931,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						  alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
@@ -5952,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -48,7 +49,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
 
+	ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		return ret;
 	/*
 	 * 1 - inode item
 	 * 2 - refs
 	 * 1 - root item
 	 * 2 - dir items
 	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		goto fail;
-
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
 	pending_snapshot->root = root;
+
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+	BUG_ON(ret);
+
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
 	BUG_ON(ret);
-	btrfs_unreserve_metadata_space(root, 6);
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	btrfs_orphan_cleanup(pending_snapshot->snap);
 
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
 	if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	kfree(pending_snapshot);
 	return ret;
 }
 
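
create_snapshot() above no longer builds the snapshot itself: it queues a btrfs_pending_snapshot, whose block_rsv is filled by btrfs_snap_reserve_metadata(), on the transaction and lets the commit do the actual work, reporting back through pending_snapshot->error. The flow, compressed into a sketch with error handling omitted:

	pending = kzalloc(sizeof(*pending), GFP_NOFS);
	btrfs_init_block_rsv(&pending->block_rsv);
	pending->dentry = dentry;
	pending->root = root;

	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
	btrfs_snap_reserve_metadata(trans, pending);	/* fill the rsv */

	list_add(&pending->list, &trans->transaction->pending_snapshots);
	btrfs_commit_transaction(trans, root->fs_info->extent_root);

	ret = pending->error;	/* set while the commit ran */
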
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen);
+		error = create_snapshot(snap_src, dentry);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen);
@@ -511,7 +497,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 
-		if (!em)
+		if (IS_ERR(em))
 			return 0;
 	}
 
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = 1;
 
-		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-		if (ret) {
-			ret = -ENOSPC;
-			break;
-		}
-
-		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
-			ret = -ENOSPC;
-			break;
-		}
+		ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto err_unlock;
 again:
 		if (inode->i_size == 0 ||
 		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
 		}
 
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
+			ret = -ENOMEM;
 			goto err_reservations;
+		}
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto err_reservations;
 			}
 		}
@@ -644,8 +623,7 @@ again:
 		wait_on_page_writeback(page);
 
 		if (PageDirty(page)) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
+			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 			goto loop_unlock;
 		}
 
@@ -683,7 +661,6 @@ loop_unlock:
 		page_cache_release(page);
 		mutex_unlock(&inode->i_mutex);
 
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 		i++;
 	}
@@ -713,9 +690,9 @@ loop_unlock:
 	return 0;
 
 err_reservations:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
 	mutex_unlock(&inode->i_mutex);
-	btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	return ret;
 }
 
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		       device->name, (unsigned long long)new_size);
 
 	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -1212,6 +1189,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 		return -EPERM;
 
 	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
 	if (copy_from_user(args, argp, sizeof(*args))) {
 		kfree(args);
 		return -EFAULT;
@@ -1297,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				  dest->root_key.objectid,
 				  dentry->d_name.name,
@@ -1311,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	dest->root_item.drop_level = 0;
 	btrfs_set_root_refs(&dest->root_item, 0);
 
-	ret = btrfs_insert_orphan_item(trans,
-				       root->fs_info->tree_root,
-				       dest->root_key.objectid);
-	BUG_ON(ret);
+	if (!xchg(&dest->orphan_item_inserted, 1)) {
+		ret = btrfs_insert_orphan_item(trans,
+					       root->fs_info->tree_root,
+					       dest->root_key.objectid);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
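
The xchg() above is a one-shot latch: it atomically stores 1 and hands back the previous value, so exactly one path inserts the orphan item for a given root even if deletion races with another caller. The generic shape of the idiom (flag and do_once() are placeholders, not btrfs symbols):

	/* only the caller that observes the old value 0 proceeds */
	if (!xchg(&flag, 1))
		do_once();	/* runs at most once across all racers */
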
@@ -1355,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			ret = -EPERM;
 			goto out;
 		}
-		btrfs_defrag_root(root, 0);
-		btrfs_defrag_root(root->fs_info->extent_root, 0);
+		ret = btrfs_defrag_root(root, 0);
+		if (ret)
+			goto out;
+		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
 		break;
 	case S_IFREG:
 		if (!(file->f_mode & FMODE_WRITE)) {
@@ -1375,6 +1365,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 					   sizeof(*range))) {
 				ret = -EFAULT;
 				kfree(range);
+				goto out;
 			}
 			/* compression requires us to start the IO */
 			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1385,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			/* the rest are all set to zero by kzalloc */
 			range->len = (u64)-1;
 		}
-		btrfs_defrag_file(file, range);
+		ret = btrfs_defrag_file(file, range);
 		kfree(range);
 		break;
+	default:
+		ret = -EINVAL;
 	}
 out:
 	mnt_drop_write(file->f_path.mnt);
@@ -1477,12 +1470,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		ret = -EBADF;
 		goto out_drop_write;
 	}
+
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
@@ -1541,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		btrfs_wait_ordered_range(src, off, off+len);
 	}
 
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	/* punch hole in destination first */
-	btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
 	/* clone data */
 	key.objectid = src->i_ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1557,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		 * note the key will change type as we walk through the
 		 * tree.
 		 */
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
 
@@ -1620,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			new_key.objectid = inode->i_ino;
 			new_key.offset = key.offset + destoff - off;
 
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
 			if (type == BTRFS_FILE_EXTENT_REG ||
 			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				if (off > key.offset) {
+					datao += off - key.offset;
+					datal -= off - key.offset;
+				}
+
+				if (key.offset + datal > off + len)
+					datal = off + len - key.offset;
+
+				ret = btrfs_drop_extents(trans, inode,
+							 new_key.offset,
+							 new_key.offset + datal,
+							 &hint_byte, 1);
+				BUG_ON(ret);
+
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				if (ret)
-					goto out;
+				BUG_ON(ret);
 
 				leaf = path->nodes[0];
 				slot = path->slots[0];
@@ -1636,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 
-				if (off > key.offset) {
-					datao += off - key.offset;
-					datal -= off - key.offset;
-				}
-
-				if (key.offset + datal > off + len)
-					datal = off + len - key.offset;
-
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
@@ -1674,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1674 1677
1675 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1676 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1677 goto out; 1681 goto out;
1678 } 1682 }
1679 size -= skip + trim; 1683 size -= skip + trim;
1680 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1681 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1682 &new_key, size); 1693 &new_key, size);
1683 if (ret) 1694 BUG_ON(ret);
1684 goto out;
1685 1695
1686 if (skip) { 1696 if (skip) {
1687 u32 start = 1697 u32 start =
@@ -1699,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1699 } 1709 }
1700 1710
1701 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1702 } 1712 btrfs_release_path(root, path);
1703 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1704next: 1723next:
1705 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1706 key.offset++; 1725 key.offset++;
@@ -1708,17 +1727,7 @@ next:
1708 ret = 0; 1727 ret = 0;
1709out: 1728out:
1710 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1711 if (ret == 0) {
1712 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1713 if (destoff + olen > inode->i_size)
1714 btrfs_i_size_write(inode, destoff + olen);
1715 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1716 ret = btrfs_update_inode(trans, root, inode);
1717 }
1718 btrfs_end_transaction(trans, root);
1719 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1720 if (ret)
1721 vmtruncate(inode, 0);
1722out_unlock: 1731out_unlock:
1723 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1724 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
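Note on the clone ioctl hunks above: the loop now takes a short-lived transaction per source item. btrfs_start_transaction() moves inside the walk, btrfs_drop_extents() punches only the range each item will cover, the tree search runs outside any transaction (trans replaced by NULL), and i_size, ctime/mtime and the inode item are updated and the transaction ended before advancing to the next key; the new FMODE_READ check also closes a hole where a write-only fd could be used as the clone source. The datao/datal arithmetic clamps each source extent to the requested window before re-insertion at destoff. A minimal userspace sketch of just that clamp, with hypothetical values (not btrfs code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Clamp an extent item starting at key_offset, of length *datal bytes,
 * to the cloned window [off, off + len); *datao is the offset into the
 * on-disk extent. Mirrors the datao/datal adjustment in the hunks above. */
static void clamp_extent(uint64_t off, uint64_t len, uint64_t key_offset,
			 uint64_t *datao, uint64_t *datal)
{
	if (off > key_offset) {
		*datao += off - key_offset;	/* skip bytes before the window */
		*datal -= off - key_offset;
	}
	if (key_offset + *datal > off + len)	/* trim bytes past the window */
		*datal = off + len - key_offset;
}

int main(void)
{
	uint64_t datao = 0, datal = 8192;

	/* extent at file offset 0, clone window [4096, 12288) */
	clamp_extent(4096, 8192, 0, &datao, &datal);
	assert(datao == 4096 && datal == 4096);
	printf("datao=%llu datal=%llu\n",
	       (unsigned long long)datao, (unsigned long long)datal);
	return 0;
}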
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
125 return 1; 124 return 1;
126} 125}
127 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
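range_overlaps() treats both ranges as half-open intervals: [file_offset, file_offset + len) and the entry's [file_offset, file_offset + entry->len) overlap exactly when neither ends at or before the other starts. A self-contained restatement of the predicate with a couple of edge cases (illustrative names, not the kernel types):

#include <assert.h>
#include <stdint.h>

/* Half-open interval overlap: [a, a+alen) vs [b, b+blen). */
static int ranges_overlap(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
{
	if (a + alen <= b || b + blen <= a)
		return 0;
	return 1;
}

int main(void)
{
	assert(ranges_overlap(0, 4096, 4095, 1));   /* last byte shared: overlap */
	assert(!ranges_overlap(0, 4096, 4096, 1));  /* abut exactly: no overlap */
	assert(ranges_overlap(100, 50, 0, 200));    /* containment: overlap */
	return 0;
}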
128/* 136/*
 129 * find the first ordered struct that has this offset, otherwise 137
130 * the first one less than this offset 138 * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
162 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
163 * inserted. 171 * inserted.
164 */ 172 */
165int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
166 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
167{ 176{
168 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
169 struct rb_node *node; 178 struct rb_node *node;
@@ -183,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
183 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
184 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
185 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
186 /* one ref for the tree */ 198 /* one ref for the tree */
187 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
188 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -204,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
204 return 0; 216 return 0;
205} 217}
206 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
207/* 233/*
208 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
209 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -303,6 +329,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
303 struct btrfs_ordered_extent *entry) 329 struct btrfs_ordered_extent *entry)
304{ 330{
305 struct btrfs_ordered_inode_tree *tree; 331 struct btrfs_ordered_inode_tree *tree;
332 struct btrfs_root *root = BTRFS_I(inode)->root;
306 struct rb_node *node; 333 struct rb_node *node;
307 334
308 tree = &BTRFS_I(inode)->ordered_tree; 335 tree = &BTRFS_I(inode)->ordered_tree;
@@ -311,13 +338,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
315 BTRFS_I(inode)->outstanding_extents--;
316 spin_unlock(&BTRFS_I(inode)->accounting_lock);
317 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
318 inode, 1);
319
320 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
321 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
322 343
323 /* 344 /*
@@ -329,7 +350,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
329 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 350 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
330 list_del_init(&BTRFS_I(inode)->ordered_operations); 351 list_del_init(&BTRFS_I(inode)->ordered_operations);
331 } 352 }
332 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 353 spin_unlock(&root->fs_info->ordered_extent_lock);
333 354
334 return 0; 355 return 0;
335} 356}
@@ -490,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
490 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
491 * for pdflush to find them 512 * for pdflush to find them
492 */ 513 */
493 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
494 if (wait) { 516 if (wait) {
495 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
496 &entry->flags)); 518 &entry->flags));
@@ -587,6 +609,47 @@ out:
587 return entry; 609 return entry;
588} 610}
589 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
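btrfs_lookup_ordered_range() searches at file_offset first, falls back to file_offset + len if nothing sits at or after the start, then walks forward until an entry overlaps or begins past the end of the range. A sketch of the same forward scan over a sorted array standing in for the rb-tree (toy types, not the kernel API):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct entry { uint64_t off, len; };

/* Find the first entry overlapping [off, off + len) in an array sorted
 * by ->off; returns NULL if none does. Models the rb-tree walk above. */
static const struct entry *lookup_range(const struct entry *e, size_t n,
					uint64_t off, uint64_t len)
{
	for (size_t i = 0; i < n; i++) {
		if (e[i].off >= off + len)	/* starts past the range: stop */
			return NULL;
		if (e[i].off + e[i].len > off)	/* overlaps the range */
			return &e[i];
	}
	return NULL;
}

int main(void)
{
	const struct entry tree[] = { {0, 10}, {20, 10}, {40, 10} };

	assert(lookup_range(tree, 3, 25, 5) == &tree[1]);
	assert(lookup_range(tree, 3, 12, 6) == NULL);	/* falls in a gap */
	return 0;
}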
590/* 653/*
591 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
592 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -43,8 +44,12 @@ struct tree_entry {
43struct backref_node { 44struct backref_node {
44 struct rb_node rb_node; 45 struct rb_node rb_node;
45 u64 bytenr; 46 u64 bytenr;
46 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner, can be not uptodate */
47 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
48 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
49 struct list_head upper; 54 struct list_head upper;
50 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -55,9 +60,9 @@ struct backref_node {
55 struct extent_buffer *eb; 60 struct extent_buffer *eb;
56 /* level of tree block */ 61 /* level of tree block */
57 unsigned int level:8; 62 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
59 unsigned int old_root:1; 64 unsigned int cowonly:1;
60 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
61 unsigned int lowest:1; 66 unsigned int lowest:1;
62 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
63 unsigned int locked:1; 68 unsigned int locked:1;
@@ -65,6 +70,16 @@ struct backref_node {
65 unsigned int processed:1; 70 unsigned int processed:1;
66 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
67 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
68}; 83};
69 84
70/* 85/*
@@ -73,7 +88,6 @@ struct backref_node {
73struct backref_edge { 88struct backref_edge {
74 struct list_head list[2]; 89 struct list_head list[2];
75 struct backref_node *node[2]; 90 struct backref_node *node[2];
76 u64 blockptr;
77}; 91};
78 92
79#define LOWER 0 93#define LOWER 0
@@ -82,9 +96,25 @@ struct backref_edge {
82struct backref_cache { 96struct backref_cache {
83 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root; 98 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
86 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
88}; 118};
89 119
90/* 120/*
@@ -112,15 +142,6 @@ struct tree_block {
112 unsigned int key_ready:1; 142 unsigned int key_ready:1;
113}; 143};
114 144
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
125 146
126struct file_extent_cluster { 147struct file_extent_cluster {
@@ -137,36 +158,43 @@ struct reloc_control {
137 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
138 /* inode for moving data */ 159 /* inode for moving data */
139 struct inode *data_inode; 160 struct inode *data_inode;
140 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
141 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
142 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
143 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
144 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
145 /* list of reloc trees */ 171 /* list of reloc trees */
146 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
147 u64 search_start; 178 u64 search_start;
148 u64 extents_found; 179 u64 extents_found;
149 u64 extents_skipped; 180
150 int stage; 181 int block_rsv_retries;
151 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
152 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
153 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
154}; 188};
155 189
156/* stages of data relocation */ 190/* stages of data relocation */
157#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
158#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
159 193
160/* 194static void remove_backref_node(struct backref_cache *cache,
161 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
162 */ 196static void __mark_block_processed(struct reloc_control *rc,
163struct async_merge { 197 struct backref_node *node);
164 struct btrfs_work work;
165 struct reloc_control *rc;
166 struct btrfs_root *root;
167 struct completion *done;
168 atomic_t *num_pending;
169};
170 198
171static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
172{ 200{
@@ -180,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
180 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
184} 276}
185 277
186static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
187{ 280{
188 memset(node, 0, sizeof(*node)); 281 if (edge) {
189 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
190 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
191 RB_CLEAR_NODE(&node->rb_node); 284 }
192} 285}
193 286
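Allocation of backref nodes and edges is now funnelled through helpers that maintain the nr_nodes/nr_edges counters, which is what lets backref_cache_cleanup() BUG_ON() any leak at teardown. A minimal sketch of the counted-allocator pairing (assumed shape, not the kernel code):

#include <assert.h>
#include <stdlib.h>

struct cache { int nr_nodes; };
struct node  { int dummy; };

static struct node *alloc_node(struct cache *c)
{
	struct node *n = calloc(1, sizeof(*n));

	if (n)
		c->nr_nodes++;	/* count only successful allocations */
	return n;
}

static void free_node(struct cache *c, struct node *n)
{
	if (n) {
		c->nr_nodes--;
		free(n);
	}
}

int main(void)
{
	struct cache c = { 0 };
	struct node *n = alloc_node(&c);

	free_node(&c, n);
	assert(c.nr_nodes == 0);	/* cleanup-time leak check */
	return 0;
}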
194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -249,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
249 edges[idx++] = edge; 342 edges[idx++] = edge;
250 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
251 } 344 }
345 BUG_ON(node->detached);
252 *index = idx; 346 *index = idx;
253 return node; 347 return node;
254} 348}
@@ -280,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
280 return NULL; 374 return NULL;
281} 375}
282 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
283static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
284{ 386{
285 if (node->eb) { 387 if (node->eb) {
286 if (node->locked) { 388 unlock_node_buffer(node);
287 btrfs_tree_unlock(node->eb);
288 node->locked = 0;
289 }
290 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
291 node->eb = NULL; 390 node->eb = NULL;
292 } 391 }
@@ -295,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
295static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
296 struct backref_node *node) 395 struct backref_node *node)
297{ 396{
298 BUG_ON(!node->lowest);
299 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
300 398
301 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
302 list_del(&node->lower); 401 list_del(&node->lower);
303 402 if (!RB_EMPTY_NODE(&node->rb_node))
304 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
305 kfree(node); 404 free_backref_node(tree, node);
306} 405}
307 406
308/* 407/*
@@ -317,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
317 if (!node) 416 if (!node)
318 return; 417 return;
319 418
320 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
321 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
322 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
323 list[LOWER]); 422 list[LOWER]);
324 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
325 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
326 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
327 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
328 /* 435 /*
329 * add the node to pending list if no other 436 * add the node to leaf node list if no other
330 * child block cached. 437 * child block cached.
331 */ 438 */
332 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
333 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
334 &cache->pending[upper->level]);
335 upper->lowest = 1; 441 upper->lowest = 1;
336 } 442 }
337 } 443 }
444
338 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
339} 446}
340 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is reloc tree and it was created in previous
527 * transaction backref lookup can find the reloc tree,
528 * so backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
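update_backref_cache() brings the rb-tree back in sync after a commit: detached nodes are dropped, and nodes on the changed/pending lists are re-keyed from their pre-COW bytenr to new_bytenr. Re-keying is done by erase, update, re-insert, never by mutating the key of a live tree entry; a toy sorted set makes the idiom concrete (illustrative stand-in for the rb-tree):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAX 8

struct set { uint64_t key[MAX]; int n; };

static void set_del(struct set *s, int i)
{
	memmove(&s->key[i], &s->key[i + 1], (--s->n - i) * sizeof(uint64_t));
}

static void set_add(struct set *s, uint64_t k)
{
	int i = 0;

	while (i < s->n && s->key[i] < k)
		i++;
	memmove(&s->key[i + 1], &s->key[i], (s->n++ - i) * sizeof(uint64_t));
	s->key[i] = k;
}

static void rekey(struct set *s, int i, uint64_t new_key)
{
	set_del(s, i);		/* rb_erase() in the kernel version */
	set_add(s, new_key);	/* tree_insert() with the new bytenr */
}

int main(void)
{
	struct set s = { { 10, 20, 30 }, 3 };

	rekey(&s, 0, 25);	/* block 10 was cowed to bytenr 25 */
	assert(s.key[0] == 20 && s.key[1] == 25 && s.key[2] == 30);
	return 0;
}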
341/* 534/*
342 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
343 */ 536 */
@@ -452,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
452 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
453 * block are also cached. 646 * block are also cached.
454 */ 647 */
455static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
456 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
457 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
458 int level, u64 bytenr) 651 int level, u64 bytenr)
459{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
460 struct btrfs_path *path1; 654 struct btrfs_path *path1;
461 struct btrfs_path *path2; 655 struct btrfs_path *path2;
462 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -472,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
472 unsigned long end; 666 unsigned long end;
473 unsigned long ptr; 667 unsigned long ptr;
474 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
475 int ret; 671 int ret;
476 int err = 0; 672 int err = 0;
477 673
@@ -482,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
482 goto out; 678 goto out;
483 } 679 }
484 680
485 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
486 if (!node) { 682 if (!node) {
487 err = -ENOMEM; 683 err = -ENOMEM;
488 goto out; 684 goto out;
489 } 685 }
490 686
491 backref_node_init(node);
492 node->bytenr = bytenr; 687 node->bytenr = bytenr;
493 node->owner = 0;
494 node->level = level; 688 node->level = level;
495 node->lowest = 1; 689 node->lowest = 1;
496 cur = node; 690 cur = node;
@@ -586,17 +780,20 @@ again:
586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
592 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
594 root = find_tree_root(rc, eb, ref0); 787 root = find_tree_root(rc, eb, ref0);
595 if (root) 788 if (!root->ref_cows)
596 cur->root = root; 789 cur->cowonly = 1;
597 else 790 if (key.objectid == key.offset) {
598 cur->old_root = 1; 791 if (root && !should_ignore_root(root))
599 break; 792 cur->root = root;
793 else
794 list_add(&cur->list, &useless);
795 break;
796 }
600 } 797 }
601#else 798#else
602 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 799 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +810,20 @@ again:
613 break; 810 break;
614 } 811 }
615 812
616 edge = kzalloc(sizeof(*edge), GFP_NOFS); 813 edge = alloc_backref_edge(cache);
617 if (!edge) { 814 if (!edge) {
618 err = -ENOMEM; 815 err = -ENOMEM;
619 goto out; 816 goto out;
620 } 817 }
621 rb_node = tree_search(&cache->rb_root, key.offset); 818 rb_node = tree_search(&cache->rb_root, key.offset);
622 if (!rb_node) { 819 if (!rb_node) {
623 upper = kmalloc(sizeof(*upper), GFP_NOFS); 820 upper = alloc_backref_node(cache);
624 if (!upper) { 821 if (!upper) {
625 kfree(edge); 822 free_backref_edge(cache, edge);
626 err = -ENOMEM; 823 err = -ENOMEM;
627 goto out; 824 goto out;
628 } 825 }
629 backref_node_init(upper);
630 upper->bytenr = key.offset; 826 upper->bytenr = key.offset;
631 upper->owner = 0;
632 upper->level = cur->level + 1; 827 upper->level = cur->level + 1;
633 /* 828 /*
634 * backrefs for the upper level block isn't 829 * backrefs for the upper level block isn't
@@ -638,11 +833,12 @@ again:
638 } else { 833 } else {
639 upper = rb_entry(rb_node, struct backref_node, 834 upper = rb_entry(rb_node, struct backref_node,
640 rb_node); 835 rb_node);
836 BUG_ON(!upper->checked);
641 INIT_LIST_HEAD(&edge->list[UPPER]); 837 INIT_LIST_HEAD(&edge->list[UPPER]);
642 } 838 }
643 list_add(&edge->list[LOWER], &cur->upper); 839 list_add_tail(&edge->list[LOWER], &cur->upper);
644 edge->node[UPPER] = upper;
645 edge->node[LOWER] = cur; 840 edge->node[LOWER] = cur;
841 edge->node[UPPER] = upper;
646 842
647 goto next; 843 goto next;
648 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 844 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +852,17 @@ again:
656 goto out; 852 goto out;
657 } 853 }
658 854
855 if (!root->ref_cows)
856 cur->cowonly = 1;
857
659 if (btrfs_root_level(&root->root_item) == cur->level) { 858 if (btrfs_root_level(&root->root_item) == cur->level) {
660 /* tree root */ 859 /* tree root */
661 BUG_ON(btrfs_root_bytenr(&root->root_item) != 860 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 cur->bytenr); 861 cur->bytenr);
663 cur->root = root; 862 if (should_ignore_root(root))
863 list_add(&cur->list, &useless);
864 else
865 cur->root = root;
664 break; 866 break;
665 } 867 }
666 868
@@ -691,11 +893,14 @@ again:
691 if (!path2->nodes[level]) { 893 if (!path2->nodes[level]) {
692 BUG_ON(btrfs_root_bytenr(&root->root_item) != 894 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 lower->bytenr); 895 lower->bytenr);
694 lower->root = root; 896 if (should_ignore_root(root))
897 list_add(&lower->list, &useless);
898 else
899 lower->root = root;
695 break; 900 break;
696 } 901 }
697 902
698 edge = kzalloc(sizeof(*edge), GFP_NOFS); 903 edge = alloc_backref_edge(cache);
699 if (!edge) { 904 if (!edge) {
700 err = -ENOMEM; 905 err = -ENOMEM;
701 goto out; 906 goto out;
@@ -704,16 +909,17 @@ again:
704 eb = path2->nodes[level]; 909 eb = path2->nodes[level];
705 rb_node = tree_search(&cache->rb_root, eb->start); 910 rb_node = tree_search(&cache->rb_root, eb->start);
706 if (!rb_node) { 911 if (!rb_node) {
707 upper = kmalloc(sizeof(*upper), GFP_NOFS); 912 upper = alloc_backref_node(cache);
708 if (!upper) { 913 if (!upper) {
709 kfree(edge); 914 free_backref_edge(cache, edge);
710 err = -ENOMEM; 915 err = -ENOMEM;
711 goto out; 916 goto out;
712 } 917 }
713 backref_node_init(upper);
714 upper->bytenr = eb->start; 918 upper->bytenr = eb->start;
715 upper->owner = btrfs_header_owner(eb); 919 upper->owner = btrfs_header_owner(eb);
716 upper->level = lower->level + 1; 920 upper->level = lower->level + 1;
921 if (!root->ref_cows)
922 upper->cowonly = 1;
717 923
718 /* 924 /*
719 * if we know the block isn't shared 925 * if we know the block isn't shared
@@ -743,10 +949,12 @@ again:
743 rb_node); 949 rb_node);
744 BUG_ON(!upper->checked); 950 BUG_ON(!upper->checked);
745 INIT_LIST_HEAD(&edge->list[UPPER]); 951 INIT_LIST_HEAD(&edge->list[UPPER]);
952 if (!upper->owner)
953 upper->owner = btrfs_header_owner(eb);
746 } 954 }
747 list_add_tail(&edge->list[LOWER], &lower->upper); 955 list_add_tail(&edge->list[LOWER], &lower->upper);
748 edge->node[UPPER] = upper;
749 edge->node[LOWER] = lower; 956 edge->node[LOWER] = lower;
957 edge->node[UPPER] = upper;
750 958
751 if (rb_node) 959 if (rb_node)
752 break; 960 break;
@@ -784,8 +992,13 @@ next:
784 * into the cache. 992 * into the cache.
785 */ 993 */
786 BUG_ON(!node->checked); 994 BUG_ON(!node->checked);
787 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 995 cowonly = node->cowonly;
788 BUG_ON(rb_node); 996 if (!cowonly) {
997 rb_node = tree_insert(&cache->rb_root, node->bytenr,
998 &node->rb_node);
999 BUG_ON(rb_node);
1000 list_add_tail(&node->lower, &cache->leaves);
1001 }
789 1002
790 list_for_each_entry(edge, &node->upper, list[LOWER]) 1003 list_for_each_entry(edge, &node->upper, list[LOWER])
791 list_add_tail(&edge->list[UPPER], &list); 1004 list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1007,14 @@ next:
794 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1007 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 list_del_init(&edge->list[UPPER]); 1008 list_del_init(&edge->list[UPPER]);
796 upper = edge->node[UPPER]; 1009 upper = edge->node[UPPER];
1010 if (upper->detached) {
1011 list_del(&edge->list[LOWER]);
1012 lower = edge->node[LOWER];
1013 free_backref_edge(cache, edge);
1014 if (list_empty(&lower->upper))
1015 list_add(&lower->list, &useless);
1016 continue;
1017 }
797 1018
798 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1019 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 if (upper->lowest) { 1020 if (upper->lowest) {
@@ -806,25 +1027,69 @@ next:
806 } 1027 }
807 1028
808 BUG_ON(!upper->checked); 1029 BUG_ON(!upper->checked);
809 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1030 BUG_ON(cowonly != upper->cowonly);
810 &upper->rb_node); 1031 if (!cowonly) {
811 BUG_ON(rb_node); 1032 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1033 &upper->rb_node);
1034 BUG_ON(rb_node);
1035 }
812 1036
813 list_add_tail(&edge->list[UPPER], &upper->lower); 1037 list_add_tail(&edge->list[UPPER], &upper->lower);
814 1038
815 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1039 list_for_each_entry(edge, &upper->upper, list[LOWER])
816 list_add_tail(&edge->list[UPPER], &list); 1040 list_add_tail(&edge->list[UPPER], &list);
817 } 1041 }
1042 /*
1043 * process useless backref nodes. backref nodes for tree leaves
1044 * are deleted from the cache. backref nodes for upper level
1045 * tree blocks are left in the cache to avoid unnecessary backref
1046 * lookup.
1047 */
1048 while (!list_empty(&useless)) {
1049 upper = list_entry(useless.next, struct backref_node, list);
1050 list_del_init(&upper->list);
1051 BUG_ON(!list_empty(&upper->upper));
1052 if (upper == node)
1053 node = NULL;
1054 if (upper->lowest) {
1055 list_del_init(&upper->lower);
1056 upper->lowest = 0;
1057 }
1058 while (!list_empty(&upper->lower)) {
1059 edge = list_entry(upper->lower.next,
1060 struct backref_edge, list[UPPER]);
1061 list_del(&edge->list[UPPER]);
1062 list_del(&edge->list[LOWER]);
1063 lower = edge->node[LOWER];
1064 free_backref_edge(cache, edge);
1065
1066 if (list_empty(&lower->upper))
1067 list_add(&lower->list, &useless);
1068 }
1069 __mark_block_processed(rc, upper);
1070 if (upper->level > 0) {
1071 list_add(&upper->list, &cache->detached);
1072 upper->detached = 1;
1073 } else {
1074 rb_erase(&upper->rb_node, &cache->rb_root);
1075 free_backref_node(cache, upper);
1076 }
1077 }
818out: 1078out:
819 btrfs_free_path(path1); 1079 btrfs_free_path(path1);
820 btrfs_free_path(path2); 1080 btrfs_free_path(path2);
821 if (err) { 1081 if (err) {
822 INIT_LIST_HEAD(&list); 1082 while (!list_empty(&useless)) {
1083 lower = list_entry(useless.next,
1084 struct backref_node, upper);
1085 list_del_init(&lower->upper);
1086 }
823 upper = node; 1087 upper = node;
1088 INIT_LIST_HEAD(&list);
824 while (upper) { 1089 while (upper) {
825 if (RB_EMPTY_NODE(&upper->rb_node)) { 1090 if (RB_EMPTY_NODE(&upper->rb_node)) {
826 list_splice_tail(&upper->upper, &list); 1091 list_splice_tail(&upper->upper, &list);
827 kfree(upper); 1092 free_backref_node(cache, upper);
828 } 1093 }
829 1094
830 if (list_empty(&list)) 1095 if (list_empty(&list))
@@ -832,15 +1097,104 @@ out:
832 1097
833 edge = list_entry(list.next, struct backref_edge, 1098 edge = list_entry(list.next, struct backref_edge,
834 list[LOWER]); 1099 list[LOWER]);
1100 list_del(&edge->list[LOWER]);
835 upper = edge->node[UPPER]; 1101 upper = edge->node[UPPER];
836 kfree(edge); 1102 free_backref_edge(cache, edge);
837 } 1103 }
838 return ERR_PTR(err); 1104 return ERR_PTR(err);
839 } 1105 }
1106 BUG_ON(node && node->detached);
840 return node; 1107 return node;
841} 1108}
842 1109
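The new `useless` handling in build_backref_tree() defers cleanup until after the main loop: each useless upper node has its lower edges stripped, any lower node left with no remaining uppers joins the same list, and then leaves are freed outright while interior nodes are parked on cache->detached for later reuse. The list can grow while it is being drained; a toy worklist shows that shape (hypothetical toy graph, not the backref types):

#include <stdio.h>

#define NITEMS 6

/* Drain a worklist that can grow while draining: dropping one node may
 * orphan its children, which then join the same list -- the shape of
 * the 'useless' loop above. */
int main(void)
{
	int stack[NITEMS], top = 0;
	int parent_of[NITEMS] = { -1, 0, 0, 1, -1, -1 }; /* toy edges */

	stack[top++] = 0;	/* seed with one useless node */
	while (top > 0) {
		int n = stack[--top];

		printf("drop node %d\n", n);
		for (int i = 0; i < NITEMS; i++)
			if (parent_of[i] == n) {	/* now orphaned */
				parent_of[i] = -1;
				stack[top++] = i;
			}
	}
	return 0;
}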
843/* 1110/*
1111 * helper to add backref node for the newly created snapshot.
1112 * the backref node is created by cloning backref node that
1113 * corresponds to root of source tree
1114 */
1115static int clone_backref_node(struct btrfs_trans_handle *trans,
1116 struct reloc_control *rc,
1117 struct btrfs_root *src,
1118 struct btrfs_root *dest)
1119{
1120 struct btrfs_root *reloc_root = src->reloc_root;
1121 struct backref_cache *cache = &rc->backref_cache;
1122 struct backref_node *node = NULL;
1123 struct backref_node *new_node;
1124 struct backref_edge *edge;
1125 struct backref_edge *new_edge;
1126 struct rb_node *rb_node;
1127
1128 if (cache->last_trans > 0)
1129 update_backref_cache(trans, cache);
1130
1131 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1132 if (rb_node) {
1133 node = rb_entry(rb_node, struct backref_node, rb_node);
1134 if (node->detached)
1135 node = NULL;
1136 else
1137 BUG_ON(node->new_bytenr != reloc_root->node->start);
1138 }
1139
1140 if (!node) {
1141 rb_node = tree_search(&cache->rb_root,
1142 reloc_root->commit_root->start);
1143 if (rb_node) {
1144 node = rb_entry(rb_node, struct backref_node,
1145 rb_node);
1146 BUG_ON(node->detached);
1147 }
1148 }
1149
1150 if (!node)
1151 return 0;
1152
1153 new_node = alloc_backref_node(cache);
1154 if (!new_node)
1155 return -ENOMEM;
1156
1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest;
1160 new_node->root = dest;
1161
1162 if (!node->lowest) {
1163 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1164 new_edge = alloc_backref_edge(cache);
1165 if (!new_edge)
1166 goto fail;
1167
1168 new_edge->node[UPPER] = new_node;
1169 new_edge->node[LOWER] = edge->node[LOWER];
1170 list_add_tail(&new_edge->list[UPPER],
1171 &new_node->lower);
1172 }
1173 }
1174
1175 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1176 &new_node->rb_node);
1177 BUG_ON(rb_node);
1178
1179 if (!new_node->lowest) {
1180 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1181 list_add_tail(&new_edge->list[LOWER],
1182 &new_edge->node[LOWER]->upper);
1183 }
1184 }
1185 return 0;
1186fail:
1187 while (!list_empty(&new_node->lower)) {
1188 new_edge = list_entry(new_node->lower.next,
1189 struct backref_edge, list[UPPER]);
1190 list_del(&new_edge->list[UPPER]);
1191 free_backref_edge(cache, new_edge);
1192 }
1193 free_backref_node(cache, new_node);
1194 return -ENOMEM;
1195}
1196
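clone_backref_node() builds the copy privately first: the duplicated edges are chained only onto new_node->lower, and they are linked into the shared lower-node upper lists only after the rb-tree insert succeeds, so the fail path frees nothing but the half-built clone. A sketch of that build-then-publish rollback pattern (illustrative list types):

#include <stdlib.h>

struct edge { struct edge *next; };
struct node { struct edge *lower; };

/* Build a private copy first; publish nothing until it is complete. */
static struct node *clone_node(const struct node *src)
{
	struct node *clone = calloc(1, sizeof(*clone));
	const struct edge *e;

	if (!clone)
		return NULL;
	for (e = src->lower; e; e = e->next) {
		struct edge *ne = calloc(1, sizeof(*ne));

		if (!ne)
			goto fail;	/* only private memory to undo */
		ne->next = clone->lower;
		clone->lower = ne;
	}
	return clone;		/* caller links it into shared lists */
fail:
	while (clone->lower) {
		struct edge *ne = clone->lower;

		clone->lower = ne->next;
		free(ne);
	}
	free(clone);
	return NULL;
}

int main(void)
{
	struct edge e2 = { NULL }, e1 = { &e2 };
	struct node src = { &e1 };
	struct node *c = clone_node(&src);

	while (c && c->lower) {
		struct edge *ne = c->lower;

		c->lower = ne->next;
		free(ne);
	}
	free(c);
	return 0;
}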
1197/*
844 * helper to add 'address of tree root -> reloc tree' mapping 1198 * helper to add 'address of tree root -> reloc tree' mapping
845 */ 1199 */
846static int __add_reloc_root(struct btrfs_root *root) 1200static int __add_reloc_root(struct btrfs_root *root)
@@ -900,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
900 return 0; 1254 return 0;
901} 1255}
902 1256
903/* 1257static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
904 * create reloc tree for a given fs tree. reloc tree is just a 1258 struct btrfs_root *root, u64 objectid)
905 * snapshot of the fs tree with special root objectid.
906 */
907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 struct btrfs_root *root)
909{ 1259{
910 struct btrfs_root *reloc_root; 1260 struct btrfs_root *reloc_root;
911 struct extent_buffer *eb; 1261 struct extent_buffer *eb;
@@ -913,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
913 struct btrfs_key root_key; 1263 struct btrfs_key root_key;
914 int ret; 1264 int ret;
915 1265
916 if (root->reloc_root) {
917 reloc_root = root->reloc_root;
918 reloc_root->last_trans = trans->transid;
919 return 0;
920 }
921
922 if (!root->fs_info->reloc_ctl ||
923 !root->fs_info->reloc_ctl->create_reloc_root ||
924 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 return 0;
926
927 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1266 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 BUG_ON(!root_item); 1267 BUG_ON(!root_item);
929 1268
930 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1269 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 root_key.type = BTRFS_ROOT_ITEM_KEY; 1270 root_key.type = BTRFS_ROOT_ITEM_KEY;
932 root_key.offset = root->root_key.objectid; 1271 root_key.offset = objectid;
933 1272
934 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1273 if (root->root_key.objectid == objectid) {
935 BTRFS_TREE_RELOC_OBJECTID); 1274 /* called by btrfs_init_reloc_root */
936 BUG_ON(ret); 1275 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1276 BTRFS_TREE_RELOC_OBJECTID);
1277 BUG_ON(ret);
1278
1279 btrfs_set_root_last_snapshot(&root->root_item,
1280 trans->transid - 1);
1281 } else {
1282 /*
1283 * called by btrfs_reloc_post_snapshot_hook.
1284 * the source tree is a reloc tree, all tree blocks
1285 * modified after it was created have RELOC flag
1286 * set in their headers. so it's OK to not update
1287 * the 'last_snapshot'.
1288 */
1289 ret = btrfs_copy_root(trans, root, root->node, &eb,
1290 BTRFS_TREE_RELOC_OBJECTID);
1291 BUG_ON(ret);
1292 }
937 1293
938 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1294 memcpy(root_item, &root->root_item, sizeof(*root_item));
940 btrfs_set_root_refs(root_item, 1);
941 btrfs_set_root_bytenr(root_item, eb->start); 1295 btrfs_set_root_bytenr(root_item, eb->start);
942 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1296 btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 btrfs_set_root_generation(root_item, trans->transid); 1297 btrfs_set_root_generation(root_item, trans->transid);
944 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1298
945 root_item->drop_level = 0; 1299 if (root->root_key.objectid == objectid) {
1300 btrfs_set_root_refs(root_item, 0);
1301 memset(&root_item->drop_progress, 0,
1302 sizeof(struct btrfs_disk_key));
1303 root_item->drop_level = 0;
1304 }
946 1305
947 btrfs_tree_unlock(eb); 1306 btrfs_tree_unlock(eb);
948 free_extent_buffer(eb); 1307 free_extent_buffer(eb);
@@ -956,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
956 &root_key); 1315 &root_key);
957 BUG_ON(IS_ERR(reloc_root)); 1316 BUG_ON(IS_ERR(reloc_root));
958 reloc_root->last_trans = trans->transid; 1317 reloc_root->last_trans = trans->transid;
1318 return reloc_root;
1319}
1320
1321/*
1322 * create reloc tree for a given fs tree. reloc tree is just a
1323 * snapshot of the fs tree with special root objectid.
1324 */
1325int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1326 struct btrfs_root *root)
1327{
1328 struct btrfs_root *reloc_root;
1329 struct reloc_control *rc = root->fs_info->reloc_ctl;
1330 int clear_rsv = 0;
1331
1332 if (root->reloc_root) {
1333 reloc_root = root->reloc_root;
1334 reloc_root->last_trans = trans->transid;
1335 return 0;
1336 }
1337
1338 if (!rc || !rc->create_reloc_tree ||
1339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1340 return 0;
1341
1342 if (!trans->block_rsv) {
1343 trans->block_rsv = rc->block_rsv;
1344 clear_rsv = 1;
1345 }
1346 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1347 if (clear_rsv)
1348 trans->block_rsv = NULL;
959 1349
960 __add_reloc_root(reloc_root); 1350 __add_reloc_root(reloc_root);
961 root->reloc_root = reloc_root; 1351 root->reloc_root = reloc_root;
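btrfs_init_reloc_root() borrows rc->block_rsv for the duration of create_reloc_root() only when the handle arrived without a reservation, and clear_rsv restores NULL afterwards, so the reloc-tree copy is charged to relocation's reserve without disturbing callers that brought their own. The borrow-and-restore idiom in miniature (hypothetical types):

#include <assert.h>
#include <stddef.h>

struct rsv { int used; };
struct handle { struct rsv *block_rsv; };

static void do_work(struct handle *t)
{
	t->block_rsv->used++;	/* charged to whichever rsv is installed */
}

static void work_with_default(struct handle *t, struct rsv *fallback)
{
	int borrowed = 0;

	if (!t->block_rsv) {	/* borrow only if caller brought none */
		t->block_rsv = fallback;
		borrowed = 1;
	}
	do_work(t);
	if (borrowed)
		t->block_rsv = NULL;
}

int main(void)
{
	struct rsv reloc = { 0 };
	struct handle t = { NULL };

	work_with_default(&t, &reloc);
	assert(reloc.used == 1 && t.block_rsv == NULL);
	return 0;
}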
@@ -979,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
979 reloc_root = root->reloc_root; 1369 reloc_root = root->reloc_root;
980 root_item = &reloc_root->root_item; 1370 root_item = &reloc_root->root_item;
981 1371
982 if (btrfs_root_refs(root_item) == 0) { 1372 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1373 btrfs_root_refs(root_item) == 0) {
983 root->reloc_root = NULL; 1374 root->reloc_root = NULL;
984 del = 1; 1375 del = 1;
985 } 1376 }
@@ -1101,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1101 goto out; 1492 goto out;
1102 } 1493 }
1103 1494
1104 if (new_bytenr) 1495 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1105 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 ret = 0; 1496 ret = 0;
1107out: 1497out:
1108 btrfs_free_path(path); 1498 btrfs_free_path(path);
@@ -1113,19 +1503,18 @@ out:
1113 * update file extent items in the tree leaf to point to 1503 * update file extent items in the tree leaf to point to
1114 * the new locations. 1504 * the new locations.
1115 */ 1505 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans, 1506static noinline_for_stack
1117 struct reloc_control *rc, 1507int replace_file_extents(struct btrfs_trans_handle *trans,
1118 struct btrfs_root *root, 1508 struct reloc_control *rc,
1119 struct extent_buffer *leaf, 1509 struct btrfs_root *root,
1120 struct list_head *inode_list) 1510 struct extent_buffer *leaf)
1121{ 1511{
1122 struct btrfs_key key; 1512 struct btrfs_key key;
1123 struct btrfs_file_extent_item *fi; 1513 struct btrfs_file_extent_item *fi;
1124 struct inode *inode = NULL; 1514 struct inode *inode = NULL;
1125 struct inodevec *ivec = NULL;
1126 u64 parent; 1515 u64 parent;
1127 u64 bytenr; 1516 u64 bytenr;
1128 u64 new_bytenr; 1517 u64 new_bytenr = 0;
1129 u64 num_bytes; 1518 u64 num_bytes;
1130 u64 end; 1519 u64 end;
1131 u32 nritems; 1520 u32 nritems;
@@ -1165,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1165 * to complete and drop the extent cache 1554 * to complete and drop the extent cache
1166 */ 1555 */
1167 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1556 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 BUG_ON(!ivec);
1171 ivec->nr = 0;
1172 list_add_tail(&ivec->list, inode_list);
1173 }
1174 if (first) { 1557 if (first) {
1175 inode = find_next_inode(root, key.objectid); 1558 inode = find_next_inode(root, key.objectid);
1176 if (inode)
1177 ivec->inode[ivec->nr++] = inode;
1178 first = 0; 1559 first = 0;
1179 } else if (inode && inode->i_ino < key.objectid) { 1560 } else if (inode && inode->i_ino < key.objectid) {
1561 btrfs_add_delayed_iput(inode);
1180 inode = find_next_inode(root, key.objectid); 1562 inode = find_next_inode(root, key.objectid);
1181 if (inode)
1182 ivec->inode[ivec->nr++] = inode;
1183 } 1563 }
1184 if (inode && inode->i_ino == key.objectid) { 1564 if (inode && inode->i_ino == key.objectid) {
1185 end = key.offset + 1565 end = key.offset +
@@ -1203,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1203 1583
1204 ret = get_new_location(rc->data_inode, &new_bytenr, 1584 ret = get_new_location(rc->data_inode, &new_bytenr,
1205 bytenr, num_bytes); 1585 bytenr, num_bytes);
1206 if (ret > 0) 1586 if (ret > 0) {
1587 WARN_ON(1);
1207 continue; 1588 continue;
1589 }
1208 BUG_ON(ret < 0); 1590 BUG_ON(ret < 0);
1209 1591
1210 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1592 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1224 } 1606 }
1225 if (dirty) 1607 if (dirty)
1226 btrfs_mark_buffer_dirty(leaf); 1608 btrfs_mark_buffer_dirty(leaf);
1609 if (inode)
1610 btrfs_add_delayed_iput(inode);
1227 return 0; 1611 return 0;
1228} 1612}
1229 1613
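With the inodevec batching gone, replace_file_extents() hands each finished inode to btrfs_add_delayed_iput(), deferring the final iput() to a context where it cannot deadlock against the running transaction (the put_inodes() helper removed further down existed for the same reason). The general shape, pushing an expensive release out of a critical section onto a list drained later, in toy form (not the kernel helper):

#include <stdio.h>
#include <stdlib.h>

struct obj { struct obj *next; int id; };

static struct obj *deferred;	/* drained outside the critical section */

static void delayed_put(struct obj *o)
{
	o->next = deferred;	/* cheap: just queue it */
	deferred = o;
}

static void drain_deferred(void)
{
	while (deferred) {
		struct obj *o = deferred;

		deferred = o->next;
		printf("released %d\n", o->id);	/* the expensive part */
		free(o);
	}
}

int main(void)
{
	struct obj *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	a->id = 1; b->id = 2;
	/* ... inside a section where release could deadlock ... */
	delayed_put(a);
	delayed_put(b);
	/* ... critical section over ... */
	drain_deferred();
	return 0;
}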
@@ -1247,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1247 * if no block got replaced, 0 is returned. if there are other 1631 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned. 1632 * errors, a negative error number is returned.
1249 */ 1633 */
1250static int replace_path(struct btrfs_trans_handle *trans, 1634static noinline_for_stack
1251 struct btrfs_root *dest, struct btrfs_root *src, 1635int replace_path(struct btrfs_trans_handle *trans,
1252 struct btrfs_path *path, struct btrfs_key *next_key, 1636 struct btrfs_root *dest, struct btrfs_root *src,
1253 struct extent_buffer **leaf, 1637 struct btrfs_path *path, struct btrfs_key *next_key,
1254 int lowest_level, int max_level) 1638 int lowest_level, int max_level)
1255{ 1639{
1256 struct extent_buffer *eb; 1640 struct extent_buffer *eb;
1257 struct extent_buffer *parent; 1641 struct extent_buffer *parent;
@@ -1262,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1262 u64 new_ptr_gen; 1646 u64 new_ptr_gen;
1263 u64 last_snapshot; 1647 u64 last_snapshot;
1264 u32 blocksize; 1648 u32 blocksize;
1649 int cow = 0;
1265 int level; 1650 int level;
1266 int ret; 1651 int ret;
1267 int slot; 1652 int slot;
1268 1653
1269 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1654 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(lowest_level > 1 && leaf);
1272 1656
1273 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1657 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 1658again:
1275 slot = path->slots[lowest_level]; 1659 slot = path->slots[lowest_level];
1276 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1660 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 1661
@@ -1285,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1285 return 0; 1669 return 0;
1286 } 1670 }
1287 1671
1288 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1672 if (cow) {
1289 BUG_ON(ret); 1673 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1674 BUG_ON(ret);
1675 }
1290 btrfs_set_lock_blocking(eb); 1676 btrfs_set_lock_blocking(eb);
1291 1677
1292 if (next_key) { 1678 if (next_key) {
@@ -1330,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1330 1716
1331 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1717 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 memcmp_node_keys(parent, slot, path, level)) { 1718 memcmp_node_keys(parent, slot, path, level)) {
1333 if (level <= lowest_level && !leaf) { 1719 if (level <= lowest_level) {
1334 ret = 0; 1720 ret = 0;
1335 break; 1721 break;
1336 } 1722 }
@@ -1338,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1338 eb = read_tree_block(dest, old_bytenr, blocksize, 1724 eb = read_tree_block(dest, old_bytenr, blocksize,
1339 old_ptr_gen); 1725 old_ptr_gen);
1340 btrfs_tree_lock(eb); 1726 btrfs_tree_lock(eb);
1341 ret = btrfs_cow_block(trans, dest, eb, parent, 1727 if (cow) {
1342 slot, &eb); 1728 ret = btrfs_cow_block(trans, dest, eb, parent,
1343 BUG_ON(ret); 1729 slot, &eb);
1344 btrfs_set_lock_blocking(eb); 1730 BUG_ON(ret);
1345
1346 if (level <= lowest_level) {
1347 *leaf = eb;
1348 ret = 0;
1349 break;
1350 } 1731 }
1732 btrfs_set_lock_blocking(eb);
1351 1733
1352 btrfs_tree_unlock(parent); 1734 btrfs_tree_unlock(parent);
1353 free_extent_buffer(parent); 1735 free_extent_buffer(parent);
@@ -1356,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1356 continue; 1738 continue;
1357 } 1739 }
1358 1740
1741 if (!cow) {
1742 btrfs_tree_unlock(parent);
1743 free_extent_buffer(parent);
1744 cow = 1;
1745 goto again;
1746 }
1747
1359 btrfs_node_key_to_cpu(path->nodes[level], &key, 1748 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 path->slots[level]); 1749 path->slots[level]);
1361 btrfs_release_path(src, path); 1750 btrfs_release_path(src, path);
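replace_path() now makes an optimistic first pass with cow == 0: it walks read-only, and only on finding a pointer that actually needs replacing does it back out, set cow = 1, and `goto again` to redo the walk with COW enabled, so subtrees that end up untouched are never cowed. The two-pass shape in toy form (an assumed simplification of the real tree walk):

#include <assert.h>
#include <stdio.h>

/* Two-pass update: probe read-only first, redo with writes enabled
 * only if the probe found work -- the cow/goto-again shape above. */
static int update_if_needed(int *vals, int n, int from, int to)
{
	int cow = 0, i, changed = 0;

again:
	for (i = 0; i < n; i++) {
		if (vals[i] != from)
			continue;
		if (!cow) {		/* found work on the cheap pass */
			cow = 1;
			goto again;	/* restart with writes enabled */
		}
		vals[i] = to;
		changed++;
	}
	return changed;
}

int main(void)
{
	int v[] = { 1, 2, 1 };

	assert(update_if_needed(v, 3, 1, 9) == 2);
	assert(update_if_needed(v, 3, 7, 9) == 0);	/* no write pass */
	printf("%d %d %d\n", v[0], v[1], v[2]);
	return 0;
}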
@@ -1561,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1950 return 0;
1562} 1951}
1563 1952
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1578static int find_next_key(struct btrfs_path *path, int level, 1953static int find_next_key(struct btrfs_path *path, int level,
1579 struct btrfs_key *key) 1954 struct btrfs_key *key)
1580 1955
@@ -1607,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1607 struct btrfs_root *reloc_root; 1982 struct btrfs_root *reloc_root;
1608 struct btrfs_root_item *root_item; 1983 struct btrfs_root_item *root_item;
1609 struct btrfs_path *path; 1984 struct btrfs_path *path;
1610 struct extent_buffer *leaf = NULL; 1985 struct extent_buffer *leaf;
1611 unsigned long nr; 1986 unsigned long nr;
1612 int level; 1987 int level;
1613 int max_level; 1988 int max_level;
1614 int replaced = 0; 1989 int replaced = 0;
1615 int ret; 1990 int ret;
1616 int err = 0; 1991 int err = 0;
1992 u32 min_reserved;
1617 1993
1618 path = btrfs_alloc_path(); 1994 path = btrfs_alloc_path();
1619 if (!path) 1995 if (!path)
@@ -1647,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1647 btrfs_unlock_up_safe(path, 0); 2023 btrfs_unlock_up_safe(path, 0);
1648 } 2024 }
1649 2025
1650 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2026 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1651 trans = btrfs_start_transaction(root, 1); 2027 memset(&next_key, 0, sizeof(next_key));
1652 2028
1653 leaf = path->nodes[0]; 2029 while (1) {
1654 btrfs_item_key_to_cpu(leaf, &key, 0); 2030 trans = btrfs_start_transaction(root, 0);
1655 btrfs_release_path(reloc_root, path); 2031 trans->block_rsv = rc->block_rsv;
1656 2032
1657 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1658 if (ret < 0) { 2034 min_reserved, 0);
1659 err = ret; 2035 if (ret) {
1660 goto out; 2036 BUG_ON(ret != -EAGAIN);
2037 ret = btrfs_commit_transaction(trans, root);
2038 BUG_ON(ret);
2039 continue;
1661 } 2040 }
1662 2041
1663 leaf = path->nodes[0];
1664 btrfs_unlock_up_safe(path, 1);
1665 ret = replace_file_extents(trans, rc, root, leaf,
1666 &inode_list);
1667 if (ret < 0)
1668 err = ret;
1669 goto out;
1670 }
1671
1672 memset(&next_key, 0, sizeof(next_key));
1673
1674 while (1) {
1675 leaf = NULL;
1676 replaced = 0; 2042 replaced = 0;
1677 trans = btrfs_start_transaction(root, 1);
1678 max_level = level; 2043 max_level = level;
1679 2044
1680 ret = walk_down_reloc_tree(reloc_root, path, &level); 2045 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1688,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1688 if (!find_next_key(path, level, &key) && 2053 if (!find_next_key(path, level, &key) &&
1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2054 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 ret = 0; 2055 ret = 0;
1691 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 ret = replace_path(trans, root, reloc_root,
1693 path, &next_key, &leaf,
1694 level, max_level);
1695 } else { 2056 } else {
1696 ret = replace_path(trans, root, reloc_root, 2057 ret = replace_path(trans, root, reloc_root, path,
1697 path, &next_key, NULL, 2058 &next_key, level, max_level);
1698 level, max_level);
1699 } 2059 }
1700 if (ret < 0) { 2060 if (ret < 0) {
1701 err = ret; 2061 err = ret;
@@ -1707,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1707 btrfs_node_key_to_cpu(path->nodes[level], &key, 2067 btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 path->slots[level]); 2068 path->slots[level]);
1709 replaced = 1; 2069 replaced = 1;
1710 } else if (leaf) {
1711 /*
1712 * no block got replaced, try replacing file extents
1713 */
1714 btrfs_item_key_to_cpu(leaf, &key, 0);
1715 ret = replace_file_extents(trans, rc, root, leaf,
1716 &inode_list);
1717 btrfs_tree_unlock(leaf);
1718 free_extent_buffer(leaf);
1719 BUG_ON(ret < 0);
1720 } 2070 }
1721 2071
1722 ret = walk_up_reloc_tree(reloc_root, path, &level); 2072 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1733 root_item->drop_level = level; 2083 root_item->drop_level = level;
1734 2084
1735 nr = trans->blocks_used; 2085 nr = trans->blocks_used;
1736 btrfs_end_transaction(trans, root); 2086 btrfs_end_transaction_throttle(trans, root);
1737 2087
1738 btrfs_btree_balance_dirty(root, nr); 2088 btrfs_btree_balance_dirty(root, nr);
1739 2089
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1745 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2090 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 invalidate_extent_cache(root, &key, &next_key); 2091 invalidate_extent_cache(root, &key, &next_key);
1747 } 2092 }
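Instead of opening a fresh transaction per pass, merge_reloc_root() now checks that the block reservation holds at least min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2 bytes, enough for the worst case of COWing one full path in each of two trees, and commits the transaction to refill the reservation when btrfs_block_rsv_check() returns -EAGAIN. Back-of-the-envelope numbers, assuming a 4K nodesize (the real value is set at mkfs time):

    #include <stdio.h>

    #define BTRFS_MAX_LEVEL 8               /* matches ctree.h */

    int main(void)
    {
            unsigned int nodesize = 4096;   /* assumed, mkfs-dependent */
            unsigned int min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
            printf("min_reserved = %u bytes\n", min_reserved);  /* 57344 */
            return 0;
    }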
@@ -1764,87 +2109,125 @@ out:
1764 sizeof(root_item->drop_progress)); 2109 sizeof(root_item->drop_progress));
1765 root_item->drop_level = 0; 2110 root_item->drop_level = 0;
1766 btrfs_set_root_refs(root_item, 0); 2111 btrfs_set_root_refs(root_item, 0);
2112 btrfs_update_reloc_root(trans, root);
1767 } 2113 }
1768 2114
1769 nr = trans->blocks_used; 2115 nr = trans->blocks_used;
1770 btrfs_end_transaction(trans, root); 2116 btrfs_end_transaction_throttle(trans, root);
1771 2117
1772 btrfs_btree_balance_dirty(root, nr); 2118 btrfs_btree_balance_dirty(root, nr);
1773 2119
1774 put_inodes(&inode_list);
1775
1776 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2120 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 invalidate_extent_cache(root, &key, &next_key); 2121 invalidate_extent_cache(root, &key, &next_key);
1778 2122
1779 return err; 2123 return err;
1780} 2124}
1781 2125
1782/* 2126static noinline_for_stack
1783 * callback for the work threads. 2127int prepare_to_merge(struct reloc_control *rc, int err)
1784 * this function merges reloc tree with corresponding fs tree,
1785 * and then drops the reloc tree.
1786 */
1787static void merge_func(struct btrfs_work *work)
1788{ 2128{
1789 struct btrfs_trans_handle *trans; 2129 struct btrfs_root *root = rc->extent_root;
1790 struct btrfs_root *root;
1791 struct btrfs_root *reloc_root; 2130 struct btrfs_root *reloc_root;
1792 struct async_merge *async; 2131 struct btrfs_trans_handle *trans;
2132 LIST_HEAD(reloc_roots);
2133 u64 num_bytes = 0;
2134 int ret;
2135 int retries = 0;
2136
2137 mutex_lock(&root->fs_info->trans_mutex);
2138 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2139 rc->merging_rsv_size += rc->nodes_relocated * 2;
2140 mutex_unlock(&root->fs_info->trans_mutex);
2141again:
2142 if (!err) {
2143 num_bytes = rc->merging_rsv_size;
2144 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2145 num_bytes, &retries);
2146 if (ret)
2147 err = ret;
2148 }
2149
2150 trans = btrfs_join_transaction(rc->extent_root, 1);
2151
2152 if (!err) {
2153 if (num_bytes != rc->merging_rsv_size) {
2154 btrfs_end_transaction(trans, rc->extent_root);
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 retries = 0;
2158 goto again;
2159 }
2160 }
1793 2161
1794 async = container_of(work, struct async_merge, work); 2162 rc->merge_reloc_tree = 1;
1795 reloc_root = async->root; 2163
2164 while (!list_empty(&rc->reloc_roots)) {
2165 reloc_root = list_entry(rc->reloc_roots.next,
2166 struct btrfs_root, root_list);
2167 list_del_init(&reloc_root->root_list);
1796 2168
1797 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 root = read_fs_root(reloc_root->fs_info, 2169 root = read_fs_root(reloc_root->fs_info,
1799 reloc_root->root_key.offset); 2170 reloc_root->root_key.offset);
1800 BUG_ON(IS_ERR(root)); 2171 BUG_ON(IS_ERR(root));
1801 BUG_ON(root->reloc_root != reloc_root); 2172 BUG_ON(root->reloc_root != reloc_root);
1802 2173
1803 merge_reloc_root(async->rc, root); 2174 /*
1804 2175 * set reference count to 1, so btrfs_recover_relocation
1805 trans = btrfs_start_transaction(root, 1); 2176 * knows it should resume merging
2177 */
2178 if (!err)
2179 btrfs_set_root_refs(&reloc_root->root_item, 1);
1806 btrfs_update_reloc_root(trans, root); 2180 btrfs_update_reloc_root(trans, root);
1807 btrfs_end_transaction(trans, root);
1808 }
1809 2181
1810 btrfs_drop_snapshot(reloc_root, 0); 2182 list_add(&reloc_root->root_list, &reloc_roots);
2183 }
1811 2184
1812 if (atomic_dec_and_test(async->num_pending)) 2185 list_splice(&reloc_roots, &rc->reloc_roots);
1813 complete(async->done);
1814 2186
1815 kfree(async); 2187 if (!err)
2188 btrfs_commit_transaction(trans, rc->extent_root);
2189 else
2190 btrfs_end_transaction(trans, rc->extent_root);
2191 return err;
1816} 2192}
1817 2193
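prepare_to_merge() reserves merging_rsv_size bytes while outside a transaction, joins one, and then re-reads the size: relocation running in parallel may have grown the requirement, in which case the stale reservation is released and the whole step is retried. A hedged sketch of that reserve/join/revalidate loop; the names are simplified stand-ins, not btrfs primitives:

    #include <stdio.h>

    static unsigned long long required = 16 << 20;  /* may grow concurrently */

    static int reserve(unsigned long long n)  { (void)n; return 0; }
    static void release(unsigned long long n) { (void)n; }

    static int prepare(void)
    {
            unsigned long long num_bytes;
    again:
            num_bytes = required;
            if (reserve(num_bytes))
                    return -1;              /* propagate the error */
            /* "join transaction" happens here; required can change */
            if (num_bytes != required) {
                    release(num_bytes);     /* drop the stale reservation */
                    goto again;             /* re-reserve the new size */
            }
            printf("reserved %llu bytes\n", num_bytes);
            return 0;
    }

    int main(void) { return prepare(); }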
1818static int merge_reloc_roots(struct reloc_control *rc) 2194static noinline_for_stack
2195int merge_reloc_roots(struct reloc_control *rc)
1819{ 2196{
1820 struct async_merge *async;
1821 struct btrfs_root *root; 2197 struct btrfs_root *root;
1822 struct completion done; 2198 struct btrfs_root *reloc_root;
1823 atomic_t num_pending; 2199 LIST_HEAD(reloc_roots);
2200 int found = 0;
2201 int ret;
2202again:
2203 root = rc->extent_root;
2204 mutex_lock(&root->fs_info->trans_mutex);
2205 list_splice_init(&rc->reloc_roots, &reloc_roots);
2206 mutex_unlock(&root->fs_info->trans_mutex);
1824 2207
1825 init_completion(&done); 2208 while (!list_empty(&reloc_roots)) {
1826 atomic_set(&num_pending, 1); 2209 found = 1;
2210 reloc_root = list_entry(reloc_roots.next,
2211 struct btrfs_root, root_list);
1827 2212
1828 while (!list_empty(&rc->reloc_roots)) { 2213 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1829 root = list_entry(rc->reloc_roots.next, 2214 root = read_fs_root(reloc_root->fs_info,
1830 struct btrfs_root, root_list); 2215 reloc_root->root_key.offset);
1831 list_del_init(&root->root_list); 2216 BUG_ON(IS_ERR(root));
2217 BUG_ON(root->reloc_root != reloc_root);
1832 2218
1833 async = kmalloc(sizeof(*async), GFP_NOFS); 2219 ret = merge_reloc_root(rc, root);
1834 BUG_ON(!async); 2220 BUG_ON(ret);
1835 async->work.func = merge_func; 2221 } else {
1836 async->work.flags = 0; 2222 list_del_init(&reloc_root->root_list);
1837 async->rc = rc; 2223 }
1838 async->root = root; 2224 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1839 async->done = &done;
1840 async->num_pending = &num_pending;
1841 atomic_inc(&num_pending);
1842 btrfs_queue_worker(&rc->workers, &async->work);
1843 } 2225 }
1844 2226
1845 if (!atomic_dec_and_test(&num_pending)) 2227 if (found) {
1846 wait_for_completion(&done); 2228 found = 0;
1847 2229 goto again;
2230 }
1848 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2231 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 return 0; 2232 return 0;
1850} 2233}
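merge_reloc_roots() drains rc->reloc_roots by splicing the whole list onto a private head under trans_mutex and then walking it with the lock dropped; any roots queued while merging ran are caught by the goto-again pass. The same drain-outside-the-lock idiom in miniature, with a plain singly linked list standing in for the kernel list (illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    struct node { int id; struct node *next; };

    /* detach the whole shared list in one step; the kernel does this
     * under trans_mutex so producers see an empty head afterwards */
    static struct node *splice_init(struct node **shared)
    {
            struct node *batch = *shared;
            *shared = NULL;
            return batch;
    }

    int main(void)
    {
            struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct node *shared = &a;
            int found;

            do {
                    found = 0;
                    for (struct node *n = splice_init(&shared); n; n = n->next) {
                            found = 1;
                            printf("merging reloc root %d\n", n->id);
                    }
            } while (found);    /* new roots may have arrived meanwhile */
            return 0;
    }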
@@ -1875,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1875 return btrfs_record_root_in_trans(trans, root); 2258 return btrfs_record_root_in_trans(trans, root);
1876} 2259}
1877 2260
1878/* 2261static noinline_for_stack
1879 * select one tree from the trees that reference the block. 2262 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1880 * for blocks in reference counted trees, we prefer the reloc tree. 2263 struct reloc_control *rc,
1881 * if no reloc tree is found and reloc_only is true, NULL is returned. 2264 struct backref_node *node,
1882 */ 2265 struct backref_edge *edges[], int *nr)
1883static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 struct backref_node *node,
1885 struct backref_edge *edges[],
1886 int *nr, int reloc_only)
1887{ 2266{
1888 struct backref_node *next; 2267 struct backref_node *next;
1889 struct btrfs_root *root; 2268 struct btrfs_root *root;
1890 int index; 2269 int index = 0;
1891 int loop = 0; 2270
1892again:
1893 index = 0;
1894 next = node; 2271 next = node;
1895 while (1) { 2272 while (1) {
1896 cond_resched(); 2273 cond_resched();
1897 next = walk_up_backref(next, edges, &index); 2274 next = walk_up_backref(next, edges, &index);
1898 root = next->root; 2275 root = next->root;
1899 if (!root) { 2276 BUG_ON(!root);
1900 BUG_ON(!node->old_root); 2277 BUG_ON(!root->ref_cows);
1901 goto skip;
1902 }
1903
1904 /* no other choice for non-reference counted tree */
1905 if (!root->ref_cows) {
1906 BUG_ON(reloc_only);
1907 break;
1908 }
1909 2278
1910 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2279 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 record_reloc_root_in_trans(trans, root); 2280 record_reloc_root_in_trans(trans, root);
1912 break; 2281 break;
1913 } 2282 }
1914 2283
1915 if (loop) { 2284 btrfs_record_root_in_trans(trans, root);
1916 btrfs_record_root_in_trans(trans, root); 2285 root = root->reloc_root;
2286
2287 if (next->new_bytenr != root->node->start) {
2288 BUG_ON(next->new_bytenr);
2289 BUG_ON(!list_empty(&next->list));
2290 next->new_bytenr = root->node->start;
2291 next->root = root;
2292 list_add_tail(&next->list,
2293 &rc->backref_cache.changed);
2294 __mark_block_processed(rc, next);
1917 break; 2295 break;
1918 } 2296 }
1919 2297
1920 if (reloc_only || next != node) { 2298 WARN_ON(1);
1921 if (!root->reloc_root)
1922 btrfs_record_root_in_trans(trans, root);
1923 root = root->reloc_root;
1924 /*
1925 * if the reloc tree was created in the current
1926 * transaction, there is no node in the backref tree
1927 * that corresponds to the root of the reloc tree.
1928 */
1929 if (btrfs_root_last_snapshot(&root->root_item) ==
1930 trans->transid - 1)
1931 break;
1932 }
1933skip:
1934 root = NULL; 2299 root = NULL;
1935 next = walk_down_backref(edges, &index); 2300 next = walk_down_backref(edges, &index);
1936 if (!next || next->level <= node->level) 2301 if (!next || next->level <= node->level)
1937 break; 2302 break;
1938 } 2303 }
2304 if (!root)
2305 return NULL;
1939 2306
1940 if (!root && !loop && !reloc_only) { 2307 *nr = index;
1941 loop = 1; 2308 next = node;
1942 goto again; 2309 /* setup backref node path for btrfs_reloc_cow_block */
2310 while (1) {
2311 rc->backref_cache.path[next->level] = next;
2312 if (--index < 0)
2313 break;
2314 next = edges[index]->node[UPPER];
1943 } 2315 }
1944
1945 if (root)
1946 *nr = index;
1947 else
1948 *nr = 0;
1949
1950 return root; 2316 return root;
1951} 2317}
1952 2318
2319/*
2320 * select a tree root for relocation. return NULL if the block
2321 * is reference counted. we should use do_relocation() in this
2322 * case. return a tree root pointer if the block isn't reference
2323 * counted. return -ENOENT if the block is root of reloc tree.
2324 */
1953static noinline_for_stack 2325static noinline_for_stack
1954struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2326struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 struct backref_node *node) 2327 struct backref_node *node)
1956{ 2328{
2329 struct backref_node *next;
2330 struct btrfs_root *root;
2331 struct btrfs_root *fs_root = NULL;
1957 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2332 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 int nr; 2333 int index = 0;
1959 return __select_one_root(trans, node, edges, &nr, 0); 2334
2335 next = node;
2336 while (1) {
2337 cond_resched();
2338 next = walk_up_backref(next, edges, &index);
2339 root = next->root;
2340 BUG_ON(!root);
2341
2342 /* no other choice for non-reference counted tree */
2343 if (!root->ref_cows)
2344 return root;
2345
2346 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2347 fs_root = root;
2348
2349 if (next != node)
2350 return NULL;
2351
2352 next = walk_down_backref(edges, &index);
2353 if (!next || next->level <= node->level)
2354 break;
2355 }
2356
2357 if (!fs_root)
2358 return ERR_PTR(-ENOENT);
2359 return fs_root;
1960} 2360}
1961 2361
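Per the comment above it, the rewritten select_one_root() has three outcomes: a concrete root pointer for a non-reference-counted tree (and, reading the loop, apparently also when the walk ends at an fs tree root), NULL when the block sits in a reference-counted tree and must go through do_relocation(), and ERR_PTR(-ENOENT) when the block is the root of a reloc tree. A condensed sketch of how a caller such as relocate_tree_block() dispatches on that; the helper below is a toy ERR_PTR, not the kernel macro:

    #include <stdio.h>
    #include <errno.h>

    struct root { int ref_cows; };

    static inline void *ERR_PTR(long err) { return (void *)err; }  /* toy */

    static void dispatch(struct root *root)
    {
            if (root == ERR_PTR(-ENOENT)) {
                    /* root of a reloc tree: nothing to move, only
                     * mark the blocks processed */
                    printf("update_processed_blocks()\n");
            } else if (!root) {
                    /* reference counted tree: relocate via the
                     * backref cache */
                    printf("do_relocation()\n");
            } else {
                    /* got a concrete tree root: handled directly, in
                     * place when !ref_cows, via its reloc root otherwise */
                    printf("handle root, ref_cows=%d\n", root->ref_cows);
            }
    }

    int main(void)
    {
            struct root r = { 0 };
            dispatch(&r);
            dispatch(NULL);
            dispatch(ERR_PTR(-ENOENT));
            return 0;
    }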
1962static noinline_for_stack 2362static noinline_for_stack
1963struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2363u64 calcu_metadata_size(struct reloc_control *rc,
1964 struct backref_node *node, 2364 struct backref_node *node, int reserve)
1965 struct backref_edge *edges[], int *nr)
1966{ 2365{
1967 return __select_one_root(trans, node, edges, nr, 1); 2366 struct backref_node *next = node;
2367 struct backref_edge *edge;
2368 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2369 u64 num_bytes = 0;
2370 int index = 0;
2371
2372 BUG_ON(reserve && node->processed);
2373
2374 while (next) {
2375 cond_resched();
2376 while (1) {
2377 if (next->processed && (reserve || next != node))
2378 break;
2379
2380 num_bytes += btrfs_level_size(rc->extent_root,
2381 next->level);
2382
2383 if (list_empty(&next->upper))
2384 break;
2385
2386 edge = list_entry(next->upper.next,
2387 struct backref_edge, list[LOWER]);
2388 edges[index++] = edge;
2389 next = edge->node[UPPER];
2390 }
2391 next = walk_down_backref(edges, &index);
2392 }
2393 return num_bytes;
1968} 2394}
1969 2395
1970static void grab_path_buffers(struct btrfs_path *path, 2396static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1971 struct backref_node *node, 2397 struct reloc_control *rc,
1972 struct backref_edge *edges[], int nr) 2398 struct backref_node *node)
1973{ 2399{
1974 int i = 0; 2400 struct btrfs_root *root = rc->extent_root;
1975 while (1) { 2401 u64 num_bytes;
1976 drop_node_buffer(node); 2402 int ret;
1977 node->eb = path->nodes[node->level]; 2403
1978 BUG_ON(!node->eb); 2404 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1979 if (path->locks[node->level])
1980 node->locked = 1;
1981 path->nodes[node->level] = NULL;
1982 path->locks[node->level] = 0;
1983
1984 if (i >= nr)
1985 break;
1986 2405
1987 edges[i]->blockptr = node->eb->start; 2406 trans->block_rsv = rc->block_rsv;
1988 node = edges[i]->node[UPPER]; 2407 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1989 i++; 2408 &rc->block_rsv_retries);
2409 if (ret) {
2410 if (ret == -EAGAIN)
2411 rc->commit_transaction = 1;
2412 return ret;
1990 } 2413 }
2414
2415 rc->block_rsv_retries = 0;
2416 return 0;
2417}
2418
2419static void release_metadata_space(struct reloc_control *rc,
2420 struct backref_node *node)
2421{
2422 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2423 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1991} 2424}
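calcu_metadata_size() walks from the node up through every backref edge and sums btrfs_level_size() for each block that may still be COWed; reserve_metadata_space() then doubles the result, apparently budgeting for both the fs tree and the reloc tree copies. Worked arithmetic with assumed block sizes (illustrative numbers only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int leafsize = 4096;   /* assumed level-0 block size */
            unsigned int nodesize = 4096;   /* assumed upper-level size */
            /* say the upward walk visits one leaf and three upper nodes */
            unsigned long long walked = leafsize + 3ULL * nodesize;
            unsigned long long reserve = walked * 2;    /* two tree copies */
            printf("reserve %llu bytes for this node\n", reserve); /* 32768 */
            return 0;
    }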
1992 2425
1993/* 2426/*
@@ -1998,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1998 * in that case this function just updates pointers. 2431 * in that case this function just updates pointers.
1999 */ 2432 */
2000static int do_relocation(struct btrfs_trans_handle *trans, 2433static int do_relocation(struct btrfs_trans_handle *trans,
2434 struct reloc_control *rc,
2001 struct backref_node *node, 2435 struct backref_node *node,
2002 struct btrfs_key *key, 2436 struct btrfs_key *key,
2003 struct btrfs_path *path, int lowest) 2437 struct btrfs_path *path, int lowest)
@@ -2018,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2018 BUG_ON(lowest && node->eb); 2452 BUG_ON(lowest && node->eb);
2019 2453
2020 path->lowest_level = node->level + 1; 2454 path->lowest_level = node->level + 1;
2455 rc->backref_cache.path[node->level] = node;
2021 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2456 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 cond_resched(); 2457 cond_resched();
2023 if (node->eb && node->eb->start == edge->blockptr)
2024 continue;
2025 2458
2026 upper = edge->node[UPPER]; 2459 upper = edge->node[UPPER];
2027 root = select_reloc_root(trans, upper, edges, &nr); 2460 root = select_reloc_root(trans, rc, upper, edges, &nr);
2028 if (!root) 2461 BUG_ON(!root);
2029 continue; 2462
2030 2463 if (upper->eb && !upper->locked) {
2031 if (upper->eb && !upper->locked) 2464 if (!lowest) {
2465 ret = btrfs_bin_search(upper->eb, key,
2466 upper->level, &slot);
2467 BUG_ON(ret);
2468 bytenr = btrfs_node_blockptr(upper->eb, slot);
2469 if (node->eb->start == bytenr)
2470 goto next;
2471 }
2032 drop_node_buffer(upper); 2472 drop_node_buffer(upper);
2473 }
2033 2474
2034 if (!upper->eb) { 2475 if (!upper->eb) {
2035 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2476 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2039 } 2480 }
2040 BUG_ON(ret > 0); 2481 BUG_ON(ret > 0);
2041 2482
2042 slot = path->slots[upper->level]; 2483 if (!upper->eb) {
2484 upper->eb = path->nodes[upper->level];
2485 path->nodes[upper->level] = NULL;
2486 } else {
2487 BUG_ON(upper->eb != path->nodes[upper->level]);
2488 }
2043 2489
2044 btrfs_unlock_up_safe(path, upper->level + 1); 2490 upper->locked = 1;
2045 grab_path_buffers(path, upper, edges, nr); 2491 path->locks[upper->level] = 0;
2046 2492
2493 slot = path->slots[upper->level];
2047 btrfs_release_path(NULL, path); 2494 btrfs_release_path(NULL, path);
2048 } else { 2495 } else {
2049 ret = btrfs_bin_search(upper->eb, key, upper->level, 2496 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2052 } 2499 }
2053 2500
2054 bytenr = btrfs_node_blockptr(upper->eb, slot); 2501 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 if (!lowest) { 2502 if (lowest) {
2056 if (node->eb->start == bytenr) { 2503 BUG_ON(bytenr != node->bytenr);
2057 btrfs_tree_unlock(upper->eb);
2058 upper->locked = 0;
2059 continue;
2060 }
2061 } else { 2504 } else {
2062 BUG_ON(node->bytenr != bytenr); 2505 if (node->eb->start == bytenr)
2506 goto next;
2063 } 2507 }
2064 2508
2065 blocksize = btrfs_level_size(root, node->level); 2509 blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2071 if (!node->eb) { 2515 if (!node->eb) {
2072 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2516 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 slot, &eb); 2517 slot, &eb);
2518 btrfs_tree_unlock(eb);
2519 free_extent_buffer(eb);
2074 if (ret < 0) { 2520 if (ret < 0) {
2075 err = ret; 2521 err = ret;
2076 break; 2522 goto next;
2077 } 2523 }
2078 btrfs_set_lock_blocking(eb); 2524 BUG_ON(node->eb != eb);
2079 node->eb = eb;
2080 node->locked = 1;
2081 } else { 2525 } else {
2082 btrfs_set_node_blockptr(upper->eb, slot, 2526 btrfs_set_node_blockptr(upper->eb, slot,
2083 node->eb->start); 2527 node->eb->start);
@@ -2095,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2095 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2539 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 BUG_ON(ret); 2540 BUG_ON(ret);
2097 } 2541 }
2098 if (!lowest) { 2542next:
2099 btrfs_tree_unlock(upper->eb); 2543 if (!upper->pending)
2100 upper->locked = 0; 2544 drop_node_buffer(upper);
2101 } 2545 else
2546 unlock_node_buffer(upper);
2547 if (err)
2548 break;
2102 } 2549 }
2550
2551 if (!err && node->pending) {
2552 drop_node_buffer(node);
2553 list_move_tail(&node->list, &rc->backref_cache.changed);
2554 node->pending = 0;
2555 }
2556
2103 path->lowest_level = 0; 2557 path->lowest_level = 0;
2558 BUG_ON(err == -ENOSPC);
2104 return err; 2559 return err;
2105} 2560}
2106 2561
2107static int link_to_upper(struct btrfs_trans_handle *trans, 2562static int link_to_upper(struct btrfs_trans_handle *trans,
2563 struct reloc_control *rc,
2108 struct backref_node *node, 2564 struct backref_node *node,
2109 struct btrfs_path *path) 2565 struct btrfs_path *path)
2110{ 2566{
2111 struct btrfs_key key; 2567 struct btrfs_key key;
2112 if (!node->eb || list_empty(&node->upper))
2113 return 0;
2114 2568
2115 btrfs_node_key_to_cpu(node->eb, &key, 0); 2569 btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 return do_relocation(trans, node, &key, path, 0); 2570 return do_relocation(trans, rc, node, &key, path, 0);
2117} 2571}
2118 2572
2119static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2573static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 struct backref_cache *cache, 2574 struct reloc_control *rc,
2121 struct btrfs_path *path) 2575 struct btrfs_path *path, int err)
2122{ 2576{
2577 LIST_HEAD(list);
2578 struct backref_cache *cache = &rc->backref_cache;
2123 struct backref_node *node; 2579 struct backref_node *node;
2124 int level; 2580 int level;
2125 int ret; 2581 int ret;
2126 int err = 0;
2127 2582
2128 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2583 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 while (!list_empty(&cache->pending[level])) { 2584 while (!list_empty(&cache->pending[level])) {
2130 node = list_entry(cache->pending[level].next, 2585 node = list_entry(cache->pending[level].next,
2131 struct backref_node, lower); 2586 struct backref_node, list);
2132 BUG_ON(node->level != level); 2587 list_move_tail(&node->list, &list);
2588 BUG_ON(!node->pending);
2133 2589
2134 ret = link_to_upper(trans, node, path); 2590 if (!err) {
2135 if (ret < 0) 2591 ret = link_to_upper(trans, rc, node, path);
2136 err = ret; 2592 if (ret < 0)
2137 /* 2593 err = ret;
2138 * this remove the node from the pending list and 2594 }
2139 * may add some other nodes to the level + 1
2140 * pending list
2141 */
2142 remove_backref_node(cache, node);
2143 } 2595 }
2596 list_splice_init(&list, &cache->pending[level]);
2144 } 2597 }
2145 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 return err; 2598 return err;
2147} 2599}
2148 2600
2149static void mark_block_processed(struct reloc_control *rc, 2601static void mark_block_processed(struct reloc_control *rc,
2150 struct backref_node *node) 2602 u64 bytenr, u32 blocksize)
2603{
2604 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2605 EXTENT_DIRTY, GFP_NOFS);
2606}
2607
2608static void __mark_block_processed(struct reloc_control *rc,
2609 struct backref_node *node)
2151{ 2610{
2152 u32 blocksize; 2611 u32 blocksize;
2153 if (node->level == 0 || 2612 if (node->level == 0 ||
2154 in_block_group(node->bytenr, rc->block_group)) { 2613 in_block_group(node->bytenr, rc->block_group)) {
2155 blocksize = btrfs_level_size(rc->extent_root, node->level); 2614 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 set_extent_bits(&rc->processed_blocks, node->bytenr, 2615 mark_block_processed(rc, node->bytenr, blocksize);
2157 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 GFP_NOFS);
2159 } 2616 }
2160 node->processed = 1; 2617 node->processed = 1;
2161} 2618}
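The split above leaves mark_block_processed() as a thin wrapper that sets EXTENT_DIRTY over [bytenr, bytenr + blocksize - 1] in rc->processed_blocks, with tree_block_processed() testing the same bits; the extent io tree works as a sparse bitmap keyed by disk byte ranges. A small interval-set model of that bookkeeping (just the idea, not the extent_io API):

    #include <stdio.h>

    #define MAX_RANGES 16

    static unsigned long long starts[MAX_RANGES], ends[MAX_RANGES];
    static int nranges;

    /* models set_extent_bits(..., EXTENT_DIRTY, ...) */
    static void mark_processed(unsigned long long bytenr, unsigned int blocksize)
    {
            if (nranges < MAX_RANGES) {
                    starts[nranges] = bytenr;
                    ends[nranges] = bytenr + blocksize - 1;
                    nranges++;
            }
    }

    /* models the bit test done by tree_block_processed() */
    static int is_processed(unsigned long long bytenr, unsigned int blocksize)
    {
            for (int i = 0; i < nranges; i++)
                    if (bytenr >= starts[i] && bytenr + blocksize - 1 <= ends[i])
                            return 1;
            return 0;
    }

    int main(void)
    {
            mark_processed(1 << 20, 4096);
            printf("%d %d\n", is_processed(1 << 20, 4096),
                   is_processed(2 << 20, 4096));        /* prints "1 0" */
            return 0;
    }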
@@ -2178,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2178 if (next->processed) 2635 if (next->processed)
2179 break; 2636 break;
2180 2637
2181 mark_block_processed(rc, next); 2638 __mark_block_processed(rc, next);
2182 2639
2183 if (list_empty(&next->upper)) 2640 if (list_empty(&next->upper))
2184 break; 2641 break;
@@ -2201,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2201 return 0; 2658 return 0;
2202} 2659}
2203 2660
2204/*
2205 * check if any file extent pointers in the leaf point to
2206 * data that requires processing
2207 */
2208static int check_file_extents(struct reloc_control *rc,
2209 u64 bytenr, u32 blocksize, u64 ptr_gen)
2210{
2211 struct btrfs_key found_key;
2212 struct btrfs_file_extent_item *fi;
2213 struct extent_buffer *leaf;
2214 u32 nritems;
2215 int i;
2216 int ret = 0;
2217
2218 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220 nritems = btrfs_header_nritems(leaf);
2221 for (i = 0; i < nritems; i++) {
2222 cond_resched();
2223 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 continue;
2226 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 if (btrfs_file_extent_type(leaf, fi) ==
2228 BTRFS_FILE_EXTENT_INLINE)
2229 continue;
2230 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 if (bytenr == 0)
2232 continue;
2233 if (in_block_group(bytenr, rc->block_group)) {
2234 ret = 1;
2235 break;
2236 }
2237 }
2238 free_extent_buffer(leaf);
2239 return ret;
2240}
2241
2242/*
2243 * scan child blocks of a given block to find blocks that require processing
2244 */
2245static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 struct reloc_control *rc,
2247 struct backref_node *node,
2248 struct rb_root *blocks)
2249{
2250 struct tree_block *block;
2251 struct rb_node *rb_node;
2252 u64 bytenr;
2253 u64 ptr_gen;
2254 u32 blocksize;
2255 u32 nritems;
2256 int i;
2257 int err = 0;
2258
2259 nritems = btrfs_header_nritems(node->eb);
2260 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272
2273 readahead_tree_block(rc->extent_root,
2274 bytenr, blocksize, ptr_gen);
2275 }
2276
2277 for (i = 0; i < nritems; i++) {
2278 cond_resched();
2279 bytenr = btrfs_node_blockptr(node->eb, i);
2280 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 if (ptr_gen == trans->transid)
2282 continue;
2283 if (!in_block_group(bytenr, rc->block_group) &&
2284 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 continue;
2286 if (tree_block_processed(bytenr, blocksize, rc))
2287 continue;
2288 if (!in_block_group(bytenr, rc->block_group) &&
2289 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 continue;
2291
2292 block = kmalloc(sizeof(*block), GFP_NOFS);
2293 if (!block) {
2294 err = -ENOMEM;
2295 break;
2296 }
2297 block->bytenr = bytenr;
2298 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 block->level = node->level - 1;
2300 block->key_ready = 1;
2301 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 BUG_ON(rb_node);
2303 }
2304 if (err)
2305 free_block_list(blocks);
2306 return err;
2307}
2308
2309/*
2310 * find adjacent blocks that require processing
2311 */
2312static noinline_for_stack
2313int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 struct reloc_control *rc,
2315 struct backref_cache *cache,
2316 struct rb_root *blocks, int level,
2317 struct backref_node **upper)
2318{
2319 struct backref_node *node;
2320 int ret = 0;
2321
2322 WARN_ON(!list_empty(&cache->pending[level]));
2323
2324 if (list_empty(&cache->pending[level + 1]))
2325 return 1;
2326
2327 node = list_entry(cache->pending[level + 1].next,
2328 struct backref_node, lower);
2329 if (node->eb)
2330 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332 *upper = node;
2333 return ret;
2334}
2335
2336static int get_tree_block_key(struct reloc_control *rc, 2661static int get_tree_block_key(struct reloc_control *rc,
2337 struct tree_block *block) 2662 struct tree_block *block)
2338{ 2663{
@@ -2370,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2370 struct btrfs_path *path) 2695 struct btrfs_path *path)
2371{ 2696{
2372 struct btrfs_root *root; 2697 struct btrfs_root *root;
2373 int ret; 2698 int release = 0;
2699 int ret = 0;
2374 2700
2701 if (!node)
2702 return 0;
2703
2704 BUG_ON(node->processed);
2375 root = select_one_root(trans, node); 2705 root = select_one_root(trans, node);
2376 if (unlikely(!root)) { 2706 if (root == ERR_PTR(-ENOENT)) {
2377 rc->found_old_snapshot = 1;
2378 update_processed_blocks(rc, node); 2707 update_processed_blocks(rc, node);
2379 return 0; 2708 goto out;
2380 } 2709 }
2381 2710
2382 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2711 if (!root || root->ref_cows) {
2383 ret = do_relocation(trans, node, key, path, 1); 2712 ret = reserve_metadata_space(trans, rc, node);
2384 if (ret < 0) 2713 if (ret)
2385 goto out;
2386 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 ret = replace_file_extents(trans, rc, root,
2388 node->eb, NULL);
2389 if (ret < 0)
2390 goto out;
2391 }
2392 drop_node_buffer(node);
2393 } else if (!root->ref_cows) {
2394 path->lowest_level = node->level;
2395 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 btrfs_release_path(root, path);
2397 if (ret < 0)
2398 goto out; 2714 goto out;
2399 } else if (root != node->root) { 2715 release = 1;
2400 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 } 2716 }
2402 2717
2403 update_processed_blocks(rc, node); 2718 if (root) {
2404 ret = 0; 2719 if (root->ref_cows) {
2720 BUG_ON(node->new_bytenr);
2721 BUG_ON(!list_empty(&node->list));
2722 btrfs_record_root_in_trans(trans, root);
2723 root = root->reloc_root;
2724 node->new_bytenr = root->node->start;
2725 node->root = root;
2726 list_add_tail(&node->list, &rc->backref_cache.changed);
2727 } else {
2728 path->lowest_level = node->level;
2729 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 btrfs_release_path(root, path);
2731 if (ret > 0)
2732 ret = 0;
2733 }
2734 if (!ret)
2735 update_processed_blocks(rc, node);
2736 } else {
2737 ret = do_relocation(trans, rc, node, key, path, 1);
2738 }
2405out: 2739out:
2406 drop_node_buffer(node); 2740 if (ret || node->level == 0 || node->cowonly) {
2741 if (release)
2742 release_metadata_space(rc, node);
2743 remove_backref_node(&rc->backref_cache, node);
2744 }
2407 return ret; 2745 return ret;
2408} 2746}
2409 2747
@@ -2414,12 +2752,10 @@ static noinline_for_stack
2414int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2752int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 struct reloc_control *rc, struct rb_root *blocks) 2753 struct reloc_control *rc, struct rb_root *blocks)
2416{ 2754{
2417 struct backref_cache *cache;
2418 struct backref_node *node; 2755 struct backref_node *node;
2419 struct btrfs_path *path; 2756 struct btrfs_path *path;
2420 struct tree_block *block; 2757 struct tree_block *block;
2421 struct rb_node *rb_node; 2758 struct rb_node *rb_node;
2422 int level = -1;
2423 int ret; 2759 int ret;
2424 int err = 0; 2760 int err = 0;
2425 2761
@@ -2427,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2427 if (!path) 2763 if (!path)
2428 return -ENOMEM; 2764 return -ENOMEM;
2429 2765
2430 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 if (!cache) {
2432 btrfs_free_path(path);
2433 return -ENOMEM;
2434 }
2435
2436 backref_cache_init(cache);
2437
2438 rb_node = rb_first(blocks); 2766 rb_node = rb_first(blocks);
2439 while (rb_node) { 2767 while (rb_node) {
2440 block = rb_entry(rb_node, struct tree_block, rb_node); 2768 block = rb_entry(rb_node, struct tree_block, rb_node);
2441 if (level == -1)
2442 level = block->level;
2443 else
2444 BUG_ON(level != block->level);
2445 if (!block->key_ready) 2769 if (!block->key_ready)
2446 reada_tree_block(rc, block); 2770 reada_tree_block(rc, block);
2447 rb_node = rb_next(rb_node); 2771 rb_node = rb_next(rb_node);
@@ -2459,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2459 while (rb_node) { 2783 while (rb_node) {
2460 block = rb_entry(rb_node, struct tree_block, rb_node); 2784 block = rb_entry(rb_node, struct tree_block, rb_node);
2461 2785
2462 node = build_backref_tree(rc, cache, &block->key, 2786 node = build_backref_tree(rc, &block->key,
2463 block->level, block->bytenr); 2787 block->level, block->bytenr);
2464 if (IS_ERR(node)) { 2788 if (IS_ERR(node)) {
2465 err = PTR_ERR(node); 2789 err = PTR_ERR(node);
@@ -2469,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2469 ret = relocate_tree_block(trans, rc, node, &block->key, 2793 ret = relocate_tree_block(trans, rc, node, &block->key,
2470 path); 2794 path);
2471 if (ret < 0) { 2795 if (ret < 0) {
2472 err = ret; 2796 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2797 err = ret;
2473 goto out; 2798 goto out;
2474 } 2799 }
2475 remove_backref_node(cache, node);
2476 rb_node = rb_next(rb_node); 2800 rb_node = rb_next(rb_node);
2477 } 2801 }
2478 2802out:
2479 if (level > 0)
2480 goto out;
2481
2482 free_block_list(blocks); 2803 free_block_list(blocks);
2804 err = finish_pending_nodes(trans, rc, path, err);
2483 2805
2484 /* 2806 btrfs_free_path(path);
2485 * now backrefs of some upper level tree blocks have been cached, 2807 return err;
2486 * try relocating blocks referenced by these upper level blocks. 2808}
2487 */
2488 while (1) {
2489 struct backref_node *upper = NULL;
2490 if (trans->transaction->in_commit ||
2491 trans->transaction->delayed_refs.flushing)
2492 break;
2493 2809
2494 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2810static noinline_for_stack
2495 &upper); 2811int prealloc_file_extent_cluster(struct inode *inode,
2496 if (ret < 0) 2812 struct file_extent_cluster *cluster)
2497 err = ret; 2813{
2498 if (ret != 0) 2814 u64 alloc_hint = 0;
2499 break; 2815 u64 start;
2816 u64 end;
2817 u64 offset = BTRFS_I(inode)->index_cnt;
2818 u64 num_bytes;
2819 int nr = 0;
2820 int ret = 0;
2500 2821
2501 rb_node = rb_first(blocks); 2822 BUG_ON(cluster->start != cluster->boundary[0]);
2502 while (rb_node) { 2823 mutex_lock(&inode->i_mutex);
2503 block = rb_entry(rb_node, struct tree_block, rb_node);
2504 if (trans->transaction->in_commit ||
2505 trans->transaction->delayed_refs.flushing)
2506 goto out;
2507 BUG_ON(!block->key_ready);
2508 node = build_backref_tree(rc, cache, &block->key,
2509 level, block->bytenr);
2510 if (IS_ERR(node)) {
2511 err = PTR_ERR(node);
2512 goto out;
2513 }
2514 2824
2515 ret = relocate_tree_block(trans, rc, node, 2825 ret = btrfs_check_data_free_space(inode, cluster->end +
2516 &block->key, path); 2826 1 - cluster->start);
2517 if (ret < 0) { 2827 if (ret)
2518 err = ret; 2828 goto out;
2519 goto out;
2520 }
2521 remove_backref_node(cache, node);
2522 rb_node = rb_next(rb_node);
2523 }
2524 free_block_list(blocks);
2525 2829
2526 if (upper) { 2830 while (nr < cluster->nr) {
2527 ret = link_to_upper(trans, upper, path); 2831 start = cluster->boundary[nr] - offset;
2528 if (ret < 0) { 2832 if (nr + 1 < cluster->nr)
2529 err = ret; 2833 end = cluster->boundary[nr + 1] - 1 - offset;
2530 break; 2834 else
2531 } 2835 end = cluster->end - offset;
2532 remove_backref_node(cache, upper); 2836
2533 } 2837 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2838 num_bytes = end + 1 - start;
2839 ret = btrfs_prealloc_file_range(inode, 0, start,
2840 num_bytes, num_bytes,
2841 end + 1, &alloc_hint);
2842 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2843 if (ret)
2844 break;
2845 nr++;
2534 } 2846 }
2847 btrfs_free_reserved_data_space(inode, cluster->end +
2848 1 - cluster->start);
2535out: 2849out:
2536 free_block_list(blocks); 2850 mutex_unlock(&inode->i_mutex);
2537 2851 return ret;
2538 ret = finish_pending_nodes(trans, cache, path);
2539 if (ret < 0)
2540 err = ret;
2541
2542 kfree(cache);
2543 btrfs_free_path(path);
2544 return err;
2545} 2852}
2546 2853
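prealloc_file_extent_cluster() translates cluster boundaries from disk bytenr space into file offsets by subtracting offset (BTRFS_I(inode)->index_cnt): extent nr spans [boundary[nr] - offset, boundary[nr + 1] - 1 - offset], and the last extent ends at cluster->end - offset. The offset arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            /* illustrative cluster holding two extents */
            unsigned long long offset = 1048576;            /* index_cnt */
            unsigned long long boundary[] = { 1048576, 1310720 };
            unsigned long long cluster_end = 1572863;       /* inclusive */
            int nr_extents = 2;

            for (int nr = 0; nr < nr_extents; nr++) {
                    unsigned long long start = boundary[nr] - offset;
                    unsigned long long end = (nr + 1 < nr_extents)
                            ? boundary[nr + 1] - 1 - offset
                            : cluster_end - offset;
                    printf("prealloc file range [%llu, %llu], %llu bytes\n",
                           start, end, end + 1 - start);
            }
            return 0;
    }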
2547static noinline_for_stack 2854static noinline_for_stack
@@ -2587,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2587 u64 offset = BTRFS_I(inode)->index_cnt; 2894 u64 offset = BTRFS_I(inode)->index_cnt;
2588 unsigned long index; 2895 unsigned long index;
2589 unsigned long last_index; 2896 unsigned long last_index;
2590 unsigned int dirty_page = 0;
2591 struct page *page; 2897 struct page *page;
2592 struct file_ra_state *ra; 2898 struct file_ra_state *ra;
2593 int nr = 0; 2899 int nr = 0;
@@ -2600,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2600 if (!ra) 2906 if (!ra)
2601 return -ENOMEM; 2907 return -ENOMEM;
2602 2908
2603 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2909 ret = prealloc_file_extent_cluster(inode, cluster);
2604 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2910 if (ret)
2911 goto out;
2605 2912
2606 mutex_lock(&inode->i_mutex); 2913 file_ra_state_init(ra, inode->i_mapping);
2607 2914
2608 i_size_write(inode, cluster->end + 1 - offset);
2609 ret = setup_extent_mapping(inode, cluster->start - offset, 2915 ret = setup_extent_mapping(inode, cluster->start - offset,
2610 cluster->end - offset, cluster->start); 2916 cluster->end - offset, cluster->start);
2611 if (ret) 2917 if (ret)
2612 goto out_unlock; 2918 goto out;
2613
2614 file_ra_state_init(ra, inode->i_mapping);
2615 2919
2616 WARN_ON(cluster->start != cluster->boundary[0]); 2920 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2921 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2617 while (index <= last_index) { 2922 while (index <= last_index) {
2923 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2924 if (ret)
2925 goto out;
2926
2618 page = find_lock_page(inode->i_mapping, index); 2927 page = find_lock_page(inode->i_mapping, index);
2619 if (!page) { 2928 if (!page) {
2620 page_cache_sync_readahead(inode->i_mapping, 2929 page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2622 last_index + 1 - index); 2931 last_index + 1 - index);
2623 page = grab_cache_page(inode->i_mapping, index); 2932 page = grab_cache_page(inode->i_mapping, index);
2624 if (!page) { 2933 if (!page) {
2934 btrfs_delalloc_release_metadata(inode,
2935 PAGE_CACHE_SIZE);
2625 ret = -ENOMEM; 2936 ret = -ENOMEM;
2626 goto out_unlock; 2937 goto out;
2627 } 2938 }
2628 } 2939 }
2629 2940
@@ -2639,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2639 if (!PageUptodate(page)) { 2950 if (!PageUptodate(page)) {
2640 unlock_page(page); 2951 unlock_page(page);
2641 page_cache_release(page); 2952 page_cache_release(page);
2953 btrfs_delalloc_release_metadata(inode,
2954 PAGE_CACHE_SIZE);
2642 ret = -EIO; 2955 ret = -EIO;
2643 goto out_unlock; 2956 goto out;
2644 } 2957 }
2645 } 2958 }
2646 2959
@@ -2659,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2972 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2973 nr++;
2661 } 2974 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2975
2976 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 set_page_dirty(page); 2977 set_page_dirty(page);
2665 dirty_page++;
2666 2978
2667 unlock_extent(&BTRFS_I(inode)->io_tree, 2979 unlock_extent(&BTRFS_I(inode)->io_tree,
2668 page_start, page_end, GFP_NOFS); 2980 page_start, page_end, GFP_NOFS);
@@ -2670,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2670 page_cache_release(page); 2982 page_cache_release(page);
2671 2983
2672 index++; 2984 index++;
2673 if (nr < cluster->nr && 2985 balance_dirty_pages_ratelimited(inode->i_mapping);
2674 page_end + 1 + offset == cluster->boundary[nr]) { 2986 btrfs_throttle(BTRFS_I(inode)->root);
2675 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 dirty_page);
2677 dirty_page = 0;
2678 }
2679 }
2680 if (dirty_page) {
2681 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 dirty_page);
2683 } 2987 }
2684 WARN_ON(nr != cluster->nr); 2988 WARN_ON(nr != cluster->nr);
2685out_unlock: 2989out:
2686 mutex_unlock(&inode->i_mutex);
2687 kfree(ra); 2990 kfree(ra);
2688 return ret; 2991 return ret;
2689} 2992}
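relocate_file_extent_cluster() now reserves delalloc metadata for a single page before grabbing it, and every early exit that abandons the page gives exactly that reservation back, so the accounting stays balanced whether the loop finishes or bails. A toy model of the reserve-per-iteration, release-on-failure shape (none of these helpers are the btrfs calls):

    #include <stdio.h>

    #define PAGE_SIZE 4096

    static long outstanding;            /* bytes currently reserved */

    static int reserve_page(void)  { outstanding += PAGE_SIZE; return 0; }
    static void release_page(void) { outstanding -= PAGE_SIZE; }
    static int grab_page(int idx)  { return idx != 3; }  /* page 3 "fails" */

    int main(void)
    {
            for (int idx = 0; idx < 5; idx++) {
                    if (reserve_page())
                            return 1;
                    if (!grab_page(idx)) {
                            release_page();     /* undo before bailing */
                            fprintf(stderr, "page %d failed, %ld still held "
                                    "by dirtied pages\n", idx, outstanding);
                            return 1;
                    }
                    /* the page is dirtied; its reservation is consumed
                     * later when the delalloc range is written back */
            }
            printf("all pages dirtied, %ld bytes reserved\n", outstanding);
            return 0;
    }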
@@ -2869,9 +3172,6 @@ out:
2869static int block_use_full_backref(struct reloc_control *rc, 3172static int block_use_full_backref(struct reloc_control *rc,
2870 struct extent_buffer *eb) 3173 struct extent_buffer *eb)
2871{ 3174{
2872 struct btrfs_path *path;
2873 struct btrfs_extent_item *ei;
2874 struct btrfs_key key;
2875 u64 flags; 3175 u64 flags;
2876 int ret; 3176 int ret;
2877 3177
@@ -2879,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2879 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3179 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 return 1; 3180 return 1;
2881 3181
2882 path = btrfs_alloc_path(); 3182 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2883 BUG_ON(!path); 3183 eb->start, eb->len, NULL, &flags);
2884
2885 key.objectid = eb->start;
2886 key.type = BTRFS_EXTENT_ITEM_KEY;
2887 key.offset = eb->len;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, rc->extent_root,
2892 &key, path, 0, 0);
2893 BUG_ON(ret); 3184 BUG_ON(ret);
2894 3185
2895 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 struct btrfs_extent_item);
2897 flags = btrfs_extent_flags(path->nodes[0], ei);
2898 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3186 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 ret = 1; 3187 ret = 1;
2901 else 3188 else
2902 ret = 0; 3189 ret = 0;
2903 btrfs_free_path(path);
2904 return ret; 3190 return ret;
2905} 3191}
2906 3192
@@ -3073,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
3073 struct btrfs_extent_inline_ref *iref; 3359 struct btrfs_extent_inline_ref *iref;
3074 unsigned long ptr; 3360 unsigned long ptr;
3075 unsigned long end; 3361 unsigned long end;
3076 u32 blocksize; 3362 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3077 int ret; 3363 int ret;
3078 int err = 0; 3364 int err = 0;
3079 3365
3080 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 extent_key->offset);
3082 BUG_ON(ret < 0);
3083 if (ret > 0) {
3084 /* the relocated data is fragmented */
3085 rc->extents_skipped++;
3086 btrfs_release_path(rc->extent_root, path);
3087 return 0;
3088 }
3089
3090 blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092 eb = path->nodes[0]; 3366 eb = path->nodes[0];
3093 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3367 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3368 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
3169 */ 3443 */
3170static noinline_for_stack 3444static noinline_for_stack
3171int find_next_extent(struct btrfs_trans_handle *trans, 3445int find_next_extent(struct btrfs_trans_handle *trans,
3172 struct reloc_control *rc, struct btrfs_path *path) 3446 struct reloc_control *rc, struct btrfs_path *path,
3447 struct btrfs_key *extent_key)
3173{ 3448{
3174 struct btrfs_key key; 3449 struct btrfs_key key;
3175 struct extent_buffer *leaf; 3450 struct extent_buffer *leaf;
@@ -3224,6 +3499,7 @@ next:
3224 rc->search_start = end + 1; 3499 rc->search_start = end + 1;
3225 } else { 3500 } else {
3226 rc->search_start = key.objectid + key.offset; 3501 rc->search_start = key.objectid + key.offset;
3502 memcpy(extent_key, &key, sizeof(key));
3227 return 0; 3503 return 0;
3228 } 3504 }
3229 } 3505 }
@@ -3261,12 +3537,49 @@ static int check_extent_flags(u64 flags)
3261 return 0; 3537 return 0;
3262} 3538}
3263 3539
3540static noinline_for_stack
3541int prepare_to_relocate(struct reloc_control *rc)
3542{
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545
3546 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3547 if (!rc->block_rsv)
3548 return -ENOMEM;
3549
3550 /*
3551 * reserve some space for creating reloc trees.
3552 * btrfs_init_reloc_root will use this reservation when
3553 * there is no reservation in the transaction handle.
3554 */
3555 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3556 rc->extent_root->nodesize * 256,
3557 &rc->block_rsv_retries);
3558 if (ret)
3559 return ret;
3560
3561 rc->block_rsv->refill_used = 1;
3562 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3563
3564 memset(&rc->cluster, 0, sizeof(rc->cluster));
3565 rc->search_start = rc->block_group->key.objectid;
3566 rc->extents_found = 0;
3567 rc->nodes_relocated = 0;
3568 rc->merging_rsv_size = 0;
3569 rc->block_rsv_retries = 0;
3570
3571 rc->create_reloc_tree = 1;
3572 set_reloc_control(rc);
3573
3574 trans = btrfs_join_transaction(rc->extent_root, 1);
3575 btrfs_commit_transaction(trans, rc->extent_root);
3576 return 0;
3577}
3264 3578
3265static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3579static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266{ 3580{
3267 struct rb_root blocks = RB_ROOT; 3581 struct rb_root blocks = RB_ROOT;
3268 struct btrfs_key key; 3582 struct btrfs_key key;
3269 struct file_extent_cluster *cluster;
3270 struct btrfs_trans_handle *trans = NULL; 3583 struct btrfs_trans_handle *trans = NULL;
3271 struct btrfs_path *path; 3584 struct btrfs_path *path;
3272 struct btrfs_extent_item *ei; 3585 struct btrfs_extent_item *ei;
@@ -3276,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3276 int ret; 3589 int ret;
3277 int err = 0; 3590 int err = 0;
3278 3591
3279 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 if (!cluster)
3281 return -ENOMEM;
3282
3283 path = btrfs_alloc_path(); 3592 path = btrfs_alloc_path();
3284 if (!path) { 3593 if (!path)
3285 kfree(cluster);
3286 return -ENOMEM; 3594 return -ENOMEM;
3287 }
3288
3289 rc->extents_found = 0;
3290 rc->extents_skipped = 0;
3291
3292 rc->search_start = rc->block_group->key.objectid;
3293 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3294 GFP_NOFS);
3295
3296 rc->create_reloc_root = 1;
3297 set_reloc_control(rc);
3298 3595
3299 trans = btrfs_start_transaction(rc->extent_root, 1); 3596 ret = prepare_to_relocate(rc);
3300 btrfs_commit_transaction(trans, rc->extent_root); 3597 if (ret) {
3598 err = ret;
3599 goto out_free;
3600 }
3301 3601
3302 while (1) { 3602 while (1) {
3303 trans = btrfs_start_transaction(rc->extent_root, 1); 3603 trans = btrfs_start_transaction(rc->extent_root, 0);
3604
3605 if (update_backref_cache(trans, &rc->backref_cache)) {
3606 btrfs_end_transaction(trans, rc->extent_root);
3607 continue;
3608 }
3304 3609
3305 ret = find_next_extent(trans, rc, path); 3610 ret = find_next_extent(trans, rc, path, &key);
3306 if (ret < 0) 3611 if (ret < 0)
3307 err = ret; 3612 err = ret;
3308 if (ret != 0) 3613 if (ret != 0)
@@ -3312,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3312 3617
3313 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3618 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314 struct btrfs_extent_item); 3619 struct btrfs_extent_item);
3315 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3620 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3316 item_size = btrfs_item_size_nr(path->nodes[0],
3317 path->slots[0]);
3318 if (item_size >= sizeof(*ei)) { 3621 if (item_size >= sizeof(*ei)) {
3319 flags = btrfs_extent_flags(path->nodes[0], ei); 3622 flags = btrfs_extent_flags(path->nodes[0], ei);
3320 ret = check_extent_flags(flags); 3623 ret = check_extent_flags(flags);
@@ -3355,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3355 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3658 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356 ret = add_tree_block(rc, &key, path, &blocks); 3659 ret = add_tree_block(rc, &key, path, &blocks);
3357 } else if (rc->stage == UPDATE_DATA_PTRS && 3660 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3661 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359 ret = add_data_references(rc, &key, path, &blocks); 3662 ret = add_data_references(rc, &key, path, &blocks);
3360 } else { 3663 } else {
3361 btrfs_release_path(rc->extent_root, path); 3664 btrfs_release_path(rc->extent_root, path);
3362 ret = 0; 3665 ret = 0;
3363 } 3666 }
3364 if (ret < 0) { 3667 if (ret < 0) {
3365 err = 0; 3668 err = ret;
3366 break; 3669 break;
3367 } 3670 }
3368 3671
3369 if (!RB_EMPTY_ROOT(&blocks)) { 3672 if (!RB_EMPTY_ROOT(&blocks)) {
3370 ret = relocate_tree_blocks(trans, rc, &blocks); 3673 ret = relocate_tree_blocks(trans, rc, &blocks);
3371 if (ret < 0) { 3674 if (ret < 0) {
3675 if (ret != -EAGAIN) {
3676 err = ret;
3677 break;
3678 }
3679 rc->extents_found--;
3680 rc->search_start = key.objectid;
3681 }
3682 }
3683
3684 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3685 rc->block_rsv, 0, 5);
3686 if (ret < 0) {
3687 if (ret != -EAGAIN) {
3372 err = ret; 3688 err = ret;
3689 WARN_ON(1);
3373 break; 3690 break;
3374 } 3691 }
3692 rc->commit_transaction = 1;
3375 } 3693 }
3376 3694
3377 nr = trans->blocks_used; 3695 if (rc->commit_transaction) {
3378 btrfs_end_transaction(trans, rc->extent_root); 3696 rc->commit_transaction = 0;
3697 ret = btrfs_commit_transaction(trans, rc->extent_root);
3698 BUG_ON(ret);
3699 } else {
3700 nr = trans->blocks_used;
3701 btrfs_end_transaction_throttle(trans, rc->extent_root);
3702 btrfs_btree_balance_dirty(rc->extent_root, nr);
3703 }
3379 trans = NULL; 3704 trans = NULL;
3380 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381 3705
3382 if (rc->stage == MOVE_DATA_EXTENTS && 3706 if (rc->stage == MOVE_DATA_EXTENTS &&
3383 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3707 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384 rc->found_file_extent = 1; 3708 rc->found_file_extent = 1;
3385 ret = relocate_data_extent(rc->data_inode, 3709 ret = relocate_data_extent(rc->data_inode,
3386 &key, cluster); 3710 &key, &rc->cluster);
3387 if (ret < 0) { 3711 if (ret < 0) {
3388 err = ret; 3712 err = ret;
3389 break; 3713 break;
3390 } 3714 }
3391 } 3715 }
3392 } 3716 }
3393 btrfs_free_path(path); 3717
3718 btrfs_release_path(rc->extent_root, path);
3719 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3720 GFP_NOFS);
3394 3721
3395 if (trans) { 3722 if (trans) {
3396 nr = trans->blocks_used; 3723 nr = trans->blocks_used;
3397 btrfs_end_transaction(trans, rc->extent_root); 3724 btrfs_end_transaction_throttle(trans, rc->extent_root);
3398 btrfs_btree_balance_dirty(rc->extent_root, nr); 3725 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399 } 3726 }
3400 3727
3401 if (!err) { 3728 if (!err) {
3402 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3729 ret = relocate_file_extent_cluster(rc->data_inode,
3730 &rc->cluster);
3403 if (ret < 0) 3731 if (ret < 0)
3404 err = ret; 3732 err = ret;
3405 } 3733 }
3406 3734
3407 kfree(cluster); 3735 rc->create_reloc_tree = 0;
3736 set_reloc_control(rc);
3408 3737
3409 rc->create_reloc_root = 0; 3738 backref_cache_cleanup(&rc->backref_cache);
3410 smp_mb(); 3739 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3411 3740
3412 if (rc->extents_found > 0) { 3741 err = prepare_to_merge(rc, err);
3413 trans = btrfs_start_transaction(rc->extent_root, 1);
3414 btrfs_commit_transaction(trans, rc->extent_root);
3415 }
3416 3742
3417 merge_reloc_roots(rc); 3743 merge_reloc_roots(rc);
3418 3744
3745 rc->merge_reloc_tree = 0;
3419 unset_reloc_control(rc); 3746 unset_reloc_control(rc);
3747 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3420 3748
3421 /* get rid of pinned extents */ 3749 /* get rid of pinned extents */
3422 trans = btrfs_start_transaction(rc->extent_root, 1); 3750 trans = btrfs_join_transaction(rc->extent_root, 1);
3423 btrfs_commit_transaction(trans, rc->extent_root); 3751 btrfs_commit_transaction(trans, rc->extent_root);
3424 3752out_free:
3753 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3754 btrfs_free_path(path);
3425 return err; 3755 return err;
3426} 3756}
3427 3757
@@ -3447,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3447 btrfs_set_inode_generation(leaf, item, 1); 3777 btrfs_set_inode_generation(leaf, item, 1);
3448 btrfs_set_inode_size(leaf, item, 0); 3778 btrfs_set_inode_size(leaf, item, 0);
3449 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3779 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3780 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3781 BTRFS_INODE_PREALLOC);
3451 btrfs_mark_buffer_dirty(leaf); 3782 btrfs_mark_buffer_dirty(leaf);
3452 btrfs_release_path(root, path); 3783 btrfs_release_path(root, path);
3453out: 3784out:
@@ -3459,8 +3790,9 @@ out:
3459 * helper to create inode for data relocation. 3790 * helper to create inode for data relocation.
3460 * the inode is in data relocation tree and its link count is 0 3791 * the inode is in data relocation tree and its link count is 0
3461 */ 3792 */
3462static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3793static noinline_for_stack
3463 struct btrfs_block_group_cache *group) 3794struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3795 struct btrfs_block_group_cache *group)
3464{ 3796{
3465 struct inode *inode = NULL; 3797 struct inode *inode = NULL;
3466 struct btrfs_trans_handle *trans; 3798 struct btrfs_trans_handle *trans;
@@ -3474,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3474 if (IS_ERR(root)) 3806 if (IS_ERR(root))
3475 return ERR_CAST(root); 3807 return ERR_CAST(root);
3476 3808
3477 trans = btrfs_start_transaction(root, 1); 3809 trans = btrfs_start_transaction(root, 6);
3478 BUG_ON(!trans); 3810 if (IS_ERR(trans))
3811 return ERR_CAST(trans);
3479 3812
3480 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3813 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481 if (err) 3814 if (err)
@@ -3495,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3495out: 3828out:
3496 nr = trans->blocks_used; 3829 nr = trans->blocks_used;
3497 btrfs_end_transaction(trans, root); 3830 btrfs_end_transaction(trans, root);
3498
3499 btrfs_btree_balance_dirty(root, nr); 3831 btrfs_btree_balance_dirty(root, nr);
3500 if (err) { 3832 if (err) {
3501 if (inode) 3833 if (inode)
@@ -3505,6 +3837,21 @@ out:
3505 return inode; 3837 return inode;
3506} 3838}
3507 3839
3840static struct reloc_control *alloc_reloc_control(void)
3841{
3842 struct reloc_control *rc;
3843
3844 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3845 if (!rc)
3846 return NULL;
3847
3848 INIT_LIST_HEAD(&rc->reloc_roots);
3849 backref_cache_init(&rc->backref_cache);
3850 mapping_tree_init(&rc->reloc_root_tree);
3851 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3852 return rc;
3853}
3854
3508/* 3855/*
3509 * function to relocate all extents in a block group. 3856 * function to relocate all extents in a block group.
3510 */ 3857 */
@@ -3513,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3860 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514 struct reloc_control *rc; 3861 struct reloc_control *rc;
3515 int ret; 3862 int ret;
3863 int rw = 0;
3516 int err = 0; 3864 int err = 0;
3517 3865
3518 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3866 rc = alloc_reloc_control();
3519 if (!rc) 3867 if (!rc)
3520 return -ENOMEM; 3868 return -ENOMEM;
3521 3869
3522 mapping_tree_init(&rc->reloc_root_tree); 3870 rc->extent_root = extent_root;
3523 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524 INIT_LIST_HEAD(&rc->reloc_roots);
3525 3871
3526 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3872 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527 BUG_ON(!rc->block_group); 3873 BUG_ON(!rc->block_group);
3528 3874
3529 btrfs_init_workers(&rc->workers, "relocate", 3875 if (!rc->block_group->ro) {
3530 fs_info->thread_pool_size, NULL); 3876 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3531 3877 if (ret) {
3532 rc->extent_root = extent_root; 3878 err = ret;
3533 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3879 goto out;
3880 }
3881 rw = 1;
3882 }
3534 3883
3535 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3884 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536 if (IS_ERR(rc->data_inode)) { 3885 if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3547 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3896 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548 3897
3549 while (1) { 3898 while (1) {
3550 rc->extents_found = 0;
3551 rc->extents_skipped = 0;
3552
3553 mutex_lock(&fs_info->cleaner_mutex); 3899 mutex_lock(&fs_info->cleaner_mutex);
3554 3900
3555 btrfs_clean_old_snapshots(fs_info->tree_root); 3901 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3558 mutex_unlock(&fs_info->cleaner_mutex); 3904 mutex_unlock(&fs_info->cleaner_mutex);
3559 if (ret < 0) { 3905 if (ret < 0) {
3560 err = ret; 3906 err = ret;
3561 break; 3907 goto out;
3562 } 3908 }
3563 3909
3564 if (rc->extents_found == 0) 3910 if (rc->extents_found == 0)
@@ -3572,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3572 invalidate_mapping_pages(rc->data_inode->i_mapping, 3918 invalidate_mapping_pages(rc->data_inode->i_mapping,
3573 0, -1); 3919 0, -1);
3574 rc->stage = UPDATE_DATA_PTRS; 3920 rc->stage = UPDATE_DATA_PTRS;
3575 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576 rc->extents_skipped >= rc->extents_found) {
3577 iput(rc->data_inode);
3578 rc->data_inode = create_reloc_inode(fs_info,
3579 rc->block_group);
3580 if (IS_ERR(rc->data_inode)) {
3581 err = PTR_ERR(rc->data_inode);
3582 rc->data_inode = NULL;
3583 break;
3584 }
3585 rc->stage = MOVE_DATA_EXTENTS;
3586 rc->found_file_extent = 0;
3587 } 3921 }
3588 } 3922 }
3589 3923
@@ -3596,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3596 WARN_ON(rc->block_group->reserved > 0); 3930 WARN_ON(rc->block_group->reserved > 0);
3597 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3931 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598out: 3932out:
3933 if (err && rw)
3934 btrfs_set_block_group_rw(extent_root, rc->block_group);
3599 iput(rc->data_inode); 3935 iput(rc->data_inode);
3600 btrfs_stop_workers(&rc->workers);
3601 btrfs_put_block_group(rc->block_group); 3936 btrfs_put_block_group(rc->block_group);
3602 kfree(rc); 3937 kfree(rc);
3603 return err; 3938 return err;
@@ -3608,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 struct btrfs_trans_handle *trans; 3943 struct btrfs_trans_handle *trans;
3609 int ret; 3944 int ret;
3610 3945
3611 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3946 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3612 3947
3613 memset(&root->root_item.drop_progress, 0, 3948 memset(&root->root_item.drop_progress, 0,
3614 sizeof(root->root_item.drop_progress)); 3949 sizeof(root->root_item.drop_progress));
@@ -3701,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 if (list_empty(&reloc_roots)) 4036 if (list_empty(&reloc_roots))
3702 goto out; 4037 goto out;
3703 4038
3704 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4039 rc = alloc_reloc_control();
3705 if (!rc) { 4040 if (!rc) {
3706 err = -ENOMEM; 4041 err = -ENOMEM;
3707 goto out; 4042 goto out;
3708 } 4043 }
3709 4044
3710 mapping_tree_init(&rc->reloc_root_tree);
3711 INIT_LIST_HEAD(&rc->reloc_roots);
3712 btrfs_init_workers(&rc->workers, "relocate",
3713 root->fs_info->thread_pool_size, NULL);
3714 rc->extent_root = root->fs_info->extent_root; 4045 rc->extent_root = root->fs_info->extent_root;
3715 4046
3716 set_reloc_control(rc); 4047 set_reloc_control(rc);
3717 4048
4049 trans = btrfs_join_transaction(rc->extent_root, 1);
4050
4051 rc->merge_reloc_tree = 1;
4052
3718 while (!list_empty(&reloc_roots)) { 4053 while (!list_empty(&reloc_roots)) {
3719 reloc_root = list_entry(reloc_roots.next, 4054 reloc_root = list_entry(reloc_roots.next,
3720 struct btrfs_root, root_list); 4055 struct btrfs_root, root_list);
@@ -3734,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3734 fs_root->reloc_root = reloc_root; 4069 fs_root->reloc_root = reloc_root;
3735 } 4070 }
3736 4071
3737 trans = btrfs_start_transaction(rc->extent_root, 1);
3738 btrfs_commit_transaction(trans, rc->extent_root); 4072 btrfs_commit_transaction(trans, rc->extent_root);
3739 4073
3740 merge_reloc_roots(rc); 4074 merge_reloc_roots(rc);
3741 4075
3742 unset_reloc_control(rc); 4076 unset_reloc_control(rc);
3743 4077
3744 trans = btrfs_start_transaction(rc->extent_root, 1); 4078 trans = btrfs_join_transaction(rc->extent_root, 1);
3745 btrfs_commit_transaction(trans, rc->extent_root); 4079 btrfs_commit_transaction(trans, rc->extent_root);
3746out: 4080out:
3747 if (rc) { 4081 kfree(rc);
3748 btrfs_stop_workers(&rc->workers);
3749 kfree(rc);
3750 }
3751 while (!list_empty(&reloc_roots)) { 4082 while (!list_empty(&reloc_roots)) {
3752 reloc_root = list_entry(reloc_roots.next, 4083 reloc_root = list_entry(reloc_roots.next,
3753 struct btrfs_root, root_list); 4084 struct btrfs_root, root_list);
@@ -3813,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3813 btrfs_put_ordered_extent(ordered); 4144 btrfs_put_ordered_extent(ordered);
3814 return 0; 4145 return 0;
3815} 4146}
4147
4148void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4149 struct btrfs_root *root, struct extent_buffer *buf,
4150 struct extent_buffer *cow)
4151{
4152 struct reloc_control *rc;
4153 struct backref_node *node;
4154 int first_cow = 0;
4155 int level;
4156 int ret;
4157
4158 rc = root->fs_info->reloc_ctl;
4159 if (!rc)
4160 return;
4161
4162 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4163 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4164
4165 level = btrfs_header_level(buf);
4166 if (btrfs_header_generation(buf) <=
4167 btrfs_root_last_snapshot(&root->root_item))
4168 first_cow = 1;
4169
4170 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4171 rc->create_reloc_tree) {
4172 WARN_ON(!first_cow && level == 0);
4173
4174 node = rc->backref_cache.path[level];
4175 BUG_ON(node->bytenr != buf->start &&
4176 node->new_bytenr != buf->start);
4177
4178 drop_node_buffer(node);
4179 extent_buffer_get(cow);
4180 node->eb = cow;
4181 node->new_bytenr = cow->start;
4182
4183 if (!node->pending) {
4184 list_move_tail(&node->list,
4185 &rc->backref_cache.pending[level]);
4186 node->pending = 1;
4187 }
4188
4189 if (first_cow)
4190 __mark_block_processed(rc, node);
4191
4192 if (first_cow && level > 0)
4193 rc->nodes_relocated += buf->len;
4194 }
4195
4196 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4197 ret = replace_file_extents(trans, rc, root, cow);
4198 BUG_ON(ret);
4199 }
4200}
4201
4202/*
4203 * called before creating a snapshot. it calculates the metadata
4204 * reservation required for relocating tree blocks in the snapshot
4205 */
4206void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4207 struct btrfs_pending_snapshot *pending,
4208 u64 *bytes_to_reserve)
4209{
4210 struct btrfs_root *root;
4211 struct reloc_control *rc;
4212
4213 root = pending->root;
4214 if (!root->reloc_root)
4215 return;
4216
4217 rc = root->fs_info->reloc_ctl;
4218 if (!rc->merge_reloc_tree)
4219 return;
4220
4221 root = root->reloc_root;
4222 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4223 /*
4224 * relocation is in the stage of merging trees. the space
4225 * used by merging a reloc tree is twice the size of
4226 * relocated tree nodes in the worst case. half for cowing
4227 * the reloc tree, half for cowing the fs tree. the space
4228 * used by cowing the reloc tree will be freed after the
4229 * tree is dropped. if we create a snapshot, cowing the fs
4230 * tree may use more space than it frees, so we need to
4231 * reserve extra space.
4232 */
4233 *bytes_to_reserve += rc->nodes_relocated;
4234}
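
A back-of-envelope model of the rule above (standalone C; the 32 MiB figure is invented for illustration): cowing the fs tree while a reloc tree is being merged may consume up to nodes_relocated bytes without freeing a matching amount, so each snapshot created mid-merge adds that much to the reservation.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* hypothetical amount of tree metadata moved so far */
		uint64_t nodes_relocated = 32ULL << 20;	/* 32 MiB */
		uint64_t bytes_to_reserve = 0;

		/* mirrors btrfs_reloc_pre_snapshot(): one extra helping
		 * of nodes_relocated for the pending snapshot */
		bytes_to_reserve += nodes_relocated;

		printf("extra reservation: %llu MiB\n",
		       (unsigned long long)(bytes_to_reserve >> 20));
		return 0;
	}
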
4235
4236/*
4237 * called after snapshot is created. migrate block reservation
4238 * and create reloc root for the newly created snapshot
4239 */
4240void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4241 struct btrfs_pending_snapshot *pending)
4242{
4243 struct btrfs_root *root = pending->root;
4244 struct btrfs_root *reloc_root;
4245 struct btrfs_root *new_root;
4246 struct reloc_control *rc;
4247 int ret;
4248
4249 if (!root->reloc_root)
4250 return;
4251
4252 rc = root->fs_info->reloc_ctl;
4253 rc->merging_rsv_size += rc->nodes_relocated;
4254
4255 if (rc->merge_reloc_tree) {
4256 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4257 rc->block_rsv,
4258 rc->nodes_relocated);
4259 BUG_ON(ret);
4260 }
4261
4262 new_root = pending->snap;
4263 reloc_root = create_reloc_root(trans, root->reloc_root,
4264 new_root->root_key.objectid);
4265
4266 __add_reloc_root(reloc_root);
4267 new_root->reloc_root = reloc_root;
4268
4269 if (rc->create_reloc_tree) {
4270 ret = clone_backref_node(trans, rc, root, reloc_root);
4271 BUG_ON(ret);
4272 }
4273}
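
The rewritten btrfs_relocate_block_group() above pivots on a small idiom: flip the block group read-only before moving extents, remember in rw whether this call did the flipping, and flip it back only on error, so a successful relocation leaves the group read-only for the caller. A minimal standalone sketch of that error-path discipline (stand-in types and helpers, not the kernel API):

	#include <stdbool.h>

	struct block_group { bool ro; };

	/* stand-ins for btrfs_set_block_group_ro()/_rw() */
	static int set_ro(struct block_group *bg) { bg->ro = true; return 0; }
	static void set_rw(struct block_group *bg) { bg->ro = false; }

	static int relocate(struct block_group *bg)
	{
		bool rw = false;	/* did *we* make it read-only? */
		int err = 0;

		if (!bg->ro) {
			err = set_ro(bg);
			if (err)
				goto out;
			rw = true;
		}

		/* ... move extents; failures set err and jump to out ... */
	out:
		/* undo only our own flip, and only on failure */
		if (err && rw)
			set_rw(bg);
		return err;
	}

	int main(void)
	{
		struct block_group bg = { .ro = false };
		return relocate(&bg);
	}
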
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
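
The reworked loop in btrfs_find_orphan_roots() first tries to read the root behind each orphan item and only treats it as dead when the lookup fails with -ENOENT; any other error aborts the scan. A standalone model of that decision, with invented stand-ins for the btrfs calls:

	#include <stdio.h>
	#include <errno.h>

	/* stand-in: 0 = root still exists, -ENOENT = gone, other = hard error */
	static int read_fs_root(int objectid)
	{
		return objectid == 2 ? -ENOENT : 0;	/* invented behavior */
	}

	static int find_dead_roots(int objectid)
	{
		printf("cleaning dead root %d\n", objectid);
		return 0;
	}

	int main(void)
	{
		int err = 0;

		for (int objectid = 1; objectid <= 3; objectid++) {
			int ret = read_fs_root(objectid);
			if (!ret)
				continue;	/* root alive, skip it */
			if (ret != -ENOENT) {
				err = ret;	/* real error: stop scanning */
				break;
			}
			err = find_dead_roots(objectid);
			if (err)
				break;
		}
		return err ? 1 : 0;
	}
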
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -64,10 +65,9 @@ static void btrfs_put_super(struct super_block *sb)
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
@@ -79,7 +79,6 @@ static match_table_t tokens = {
79 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
80 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
81 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
82 {Opt_max_extent, "max_extent=%s"},
83 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -188,18 +187,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
188 info->thread_pool_size); 187 info->thread_pool_size);
189 } 188 }
190 break; 189 break;
191 case Opt_max_extent:
192 num = match_strdup(&args[0]);
193 if (num) {
194 info->max_extent = memparse(num, NULL);
195 kfree(num);
196
197 info->max_extent = max_t(u64,
198 info->max_extent, root->sectorsize);
199 printk(KERN_INFO "btrfs: max_extent at %llu\n",
200 (unsigned long long)info->max_extent);
201 }
202 break;
203 case Opt_max_inline: 190 case Opt_max_inline:
204 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
205 if (num) { 192 if (num) {
@@ -511,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
511 btrfs_start_delalloc_inodes(root, 0); 498 btrfs_start_delalloc_inodes(root, 0);
512 btrfs_wait_ordered_extents(root, 0, 0); 499 btrfs_wait_ordered_extents(root, 0, 0);
513 500
514 trans = btrfs_start_transaction(root, 1); 501 trans = btrfs_start_transaction(root, 0);
515 ret = btrfs_commit_transaction(trans, root); 502 ret = btrfs_commit_transaction(trans, root);
516 return ret; 503 return ret;
517} 504}
@@ -529,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
529 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
530 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
531 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
532 if (info->max_extent != (u64)-1)
533 seq_printf(seq, ",max_extent=%llu",
534 (unsigned long long)info->max_extent);
535 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
536 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
537 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -710,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
710 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
711 return -EINVAL; 695 return -EINVAL;
712 696
713 /* recover relocation */ 697 ret = btrfs_cleanup_fs_roots(root->fs_info);
714 ret = btrfs_recover_relocation(root);
715 WARN_ON(ret); 698 WARN_ON(ret);
716 699
717 ret = btrfs_cleanup_fs_roots(root->fs_info); 700 /* recover relocation */
701 ret = btrfs_recover_relocation(root);
718 WARN_ON(ret); 702 WARN_ON(ret);
719 703
720 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
@@ -730,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
730 struct list_head *head = &root->fs_info->space_info; 714 struct list_head *head = &root->fs_info->space_info;
731 struct btrfs_space_info *found; 715 struct btrfs_space_info *found;
732 u64 total_used = 0; 716 u64 total_used = 0;
733 u64 data_used = 0;
734 int bits = dentry->d_sb->s_blocksize_bits; 717 int bits = dentry->d_sb->s_blocksize_bits;
735 __be32 *fsid = (__be32 *)root->fs_info->fsid; 718 __be32 *fsid = (__be32 *)root->fs_info->fsid;
736 719
737 rcu_read_lock(); 720 rcu_read_lock();
738 list_for_each_entry_rcu(found, head, list) { 721 list_for_each_entry_rcu(found, head, list)
739 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 722 total_used += found->disk_used;
740 BTRFS_BLOCK_GROUP_RAID10|
741 BTRFS_BLOCK_GROUP_RAID1)) {
742 total_used += found->bytes_used;
743 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
744 data_used += found->bytes_used;
745 else
746 data_used += found->total_bytes;
747 }
748
749 total_used += found->bytes_used;
750 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
751 data_used += found->bytes_used;
752 else
753 data_used += found->total_bytes;
754 }
755 rcu_read_unlock(); 723 rcu_read_unlock();
756 724
757 buf->f_namelen = BTRFS_NAME_LEN; 725 buf->f_namelen = BTRFS_NAME_LEN;
758 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 726 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
759 buf->f_bfree = buf->f_blocks - (total_used >> bits); 727 buf->f_bfree = buf->f_blocks - (total_used >> bits);
760 buf->f_bavail = buf->f_blocks - (data_used >> bits); 728 buf->f_bavail = buf->f_bfree;
761 buf->f_bsize = dentry->d_sb->s_blocksize; 729 buf->f_bsize = dentry->d_sb->s_blocksize;
762 buf->f_type = BTRFS_SUPER_MAGIC; 730 buf->f_type = BTRFS_SUPER_MAGIC;
763 731
@@ -848,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = {
848}; 816};
849 817
850static struct miscdevice btrfs_misc = { 818static struct miscdevice btrfs_misc = {
851 .minor = MISC_DYNAMIC_MINOR, 819 .minor = BTRFS_MINOR,
852 .name = "btrfs-control", 820 .name = "btrfs-control",
853 .fops = &btrfs_ctl_fops 821 .fops = &btrfs_ctl_fops
854}; 822};
855 823
824MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
825MODULE_ALIAS("devname:btrfs-control");
826
856static int btrfs_interface_init(void) 827static int btrfs_interface_init(void)
857{ 828{
858 return misc_register(&btrfs_misc); 829 return misc_register(&btrfs_misc);
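
btrfs_statfs() above now sums the per-space_info disk_used figures instead of special-casing DUP/RAID1/RAID10 profiles, and drops the separate data estimate by setting f_bavail = f_bfree. The arithmetic, as a runnable model with invented sizes:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t total_bytes = 100ULL << 30;	/* hypothetical 100 GiB fs */
		uint64_t disk_used[] = { 30ULL << 30, 1ULL << 30 }; /* per space_info */
		int bits = 12;				/* 4 KiB blocksize */
		uint64_t total_used = 0;

		for (unsigned i = 0; i < sizeof(disk_used) / sizeof(disk_used[0]); i++)
			total_used += disk_used[i];

		uint64_t f_blocks = total_bytes >> bits;
		uint64_t f_bfree  = f_blocks - (total_used >> bits);
		uint64_t f_bavail = f_bfree;	/* no separate data estimate */

		printf("blocks=%llu free=%llu avail=%llu\n",
		       (unsigned long long)f_blocks,
		       (unsigned long long)f_bfree,
		       (unsigned long long)f_bavail);
		return 0;
	}
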
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -169,54 +165,89 @@ enum btrfs_trans_type {
169 TRANS_USERSPACE, 165 TRANS_USERSPACE,
170}; 166};
171 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
173 int num_blocks, int type) 178 u64 num_items, int type)
174{ 179{
175 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
177 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
178 188
179 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
180 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
183 wait_current_trans(root); 191 wait_current_trans(root);
192
184 ret = join_transaction(root); 193 ret = join_transaction(root);
185 BUG_ON(ret); 194 BUG_ON(ret);
186 195
187 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
188 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
189 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
190 h->blocks_used = 0; 202 h->blocks_used = 0;
191 h->block_group = 0; 203 h->block_group = 0;
192 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
195 207
196 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
197 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
198 226
199 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
200 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
201 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
202 return h; 233 return h;
203} 234}
204 235
205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
206 int num_blocks) 237 int num_items)
207{ 238{
208 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
209} 240}
210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
211 int num_blocks) 242 int num_blocks)
212{ 243{
213 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
214} 245}
215 246
216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
217 int num_blocks) 248 int num_blocks)
218{ 249{
219 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
220} 251}
221 252
222/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -290,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
290 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
291} 322}
292 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
293static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
294 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
295{ 352{
296 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
297 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
298 int count = 0; 355 int count = 0;
299 356
@@ -317,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 count++; 374 count++;
318 } 375 }
319 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
320 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
321 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
322 WARN_ON(cur_trans != trans->transaction);
323 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
324 cur_trans->num_writers--; 393 cur_trans->num_writers--;
325 394
@@ -607,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
607 676
608 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
609 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
610 680
611 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
612 switch_commit_root(root); 682 switch_commit_root(root);
@@ -631,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
631int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
632{ 702{
633 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
634 int ret;
635 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
636 unsigned long nr; 706 unsigned long nr;
637 707
638 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
639 if (root->defrag_running)
640 return 0; 709 return 0;
641 trans = btrfs_start_transaction(root, 1); 710
642 while (1) { 711 while (1) {
643 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
644 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
645 nr = trans->blocks_used; 718 nr = trans->blocks_used;
646 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
647 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
648 cond_resched(); 721 cond_resched();
649 722
650 trans = btrfs_start_transaction(root, 1);
651 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
652 break; 724 break;
653 } 725 }
654 root->defrag_running = 0; 726 root->defrag_running = 0;
655 smp_mb(); 727 return ret;
656 btrfs_end_transaction(trans, root);
657 return 0;
658} 728}
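
The xchg() in the btrfs_defrag_root() hunk above turns the old smp_mb()/test sequence into an atomic test-and-set: exactly one caller sees 0 and proceeds, everyone else backs off, and the flag is cleared when the work ends. Roughly the same guard in portable C11 (a model, not the kernel primitive):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int defrag_running;

	static int defrag_root(void)
	{
		/* mirrors: if (xchg(&root->defrag_running, 1)) return 0; */
		if (atomic_exchange(&defrag_running, 1))
			return 0;	/* another caller is already defragging */

		/* ... defrag work would happen here ... */

		atomic_store(&defrag_running, 0);
		return 1;
	}

	int main(void)
	{
		printf("did work: %d\n", defrag_root());
		return 0;
	}
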
659 729
660#if 0 730#if 0
@@ -760,29 +830,72 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 830 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 831 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root;
834 struct inode *parent_inode;
835 struct dentry *dentry;
763 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
764 struct extent_buffer *old; 837 struct extent_buffer *old;
765 int ret; 838 int ret;
839 int retries = 0;
840 u64 to_reserve = 0;
841 u64 index = 0;
766 u64 objectid; 842 u64 objectid;
767 843
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 845 if (!new_root_item) {
770 ret = -ENOMEM; 846 pending->error = -ENOMEM;
771 goto fail; 847 goto fail;
772 } 848 }
849
773 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
774 if (ret) 851 if (ret) {
852 pending->error = ret;
775 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
867
868 key.objectid = objectid;
869 key.offset = (u64)-1;
870 key.type = BTRFS_ROOT_ITEM_KEY;
871
872 trans->block_rsv = &pending->block_rsv;
873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root);
878
879 /*
880 * insert the directory item
881 */
882 ret = btrfs_set_inode_index(parent_inode, &index);
883 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key,
887 BTRFS_FT_DIR, index);
888 BUG_ON(ret);
889
890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret);
776 894
777 record_root_in_trans(trans, root); 895 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780 898
781 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785
786 old = btrfs_lock_root_node(root); 899 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old); 900 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old); 901 btrfs_set_lock_blocking(old);
@@ -792,62 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
792 free_extent_buffer(old); 905 free_extent_buffer(old);
793 906
794 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
796 new_root_item); 909 key.offset = trans->transid;
910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
797 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
799 if (ret) 913 BUG_ON(ret);
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key));
804fail:
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 914
823 /* 915 /*
824 * insert the directory item 916 * insert root back/forward references
825 */ 917 */
826 namelen = strlen(pending->name); 918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
827 ret = btrfs_set_inode_index(parent_inode, &index);
828 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen,
830 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index);
832
833 if (ret)
834 goto fail;
835
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret);
839
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid,
842 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
844 namelen); 921 dentry->d_name.name, dentry->d_name.len);
845
846 BUG_ON(ret); 922 BUG_ON(ret);
847 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
848fail: 930fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 931 kfree(new_root_item);
850 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
851} 934}
852 935
853/* 936/*
@@ -867,25 +950,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 950 return 0;
868} 951}
869 952
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 953static void update_super_roots(struct btrfs_root *root)
890{ 954{
891 struct btrfs_root_item *root_item; 955 struct btrfs_root_item *root_item;
@@ -914,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
914 return ret; 978 return ret;
915} 979}
916 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
917int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
918 struct btrfs_root *root) 992 struct btrfs_root *root)
919{ 993{
@@ -935,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
936 BUG_ON(ret); 1010 BUG_ON(ret);
937 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
938 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
939 /* 1015 /*
940 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -987,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
987 snap_pending = 1; 1063 snap_pending = 1;
988 1064
989 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
990 prepare_to_wait(&cur_trans->writer_wait, &wait,
991 TASK_UNINTERRUPTIBLE);
992
993 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
994 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
995 else if (should_grow) 1068 else if (should_grow)
@@ -1012,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1012 */ 1085 */
1013 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
1014 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
1015 smp_mb(); 1091 smp_mb();
1016 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
1017 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1097,9 +1173,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1097 1173
1098 btrfs_finish_extent_commit(trans, root); 1174 btrfs_finish_extent_commit(trans, root);
1099 1175
1100 /* do the directory inserts of any pending snapshot creations */
1101 finish_pending_snapshots(trans, root->fs_info);
1102
1103 mutex_lock(&root->fs_info->trans_mutex); 1176 mutex_lock(&root->fs_info->trans_mutex);
1104 1177
1105 cur_trans->commit_done = 1; 1178 cur_trans->commit_done = 1;
@@ -1142,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1142 1215
1143 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1144 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1145 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1146 else 1219 else
1147 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1148 } 1221 }
1149 return 0; 1222 return 0;
1150} 1223}
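
With the rework above, start_transaction() can fail (handle allocation or metadata reservation), so it hands back ERR_PTR(-E...) instead of a possibly-NULL handle, and callers throughout this series switch from BUG_ON(!trans) to IS_ERR()/PTR_ERR(). A simplified userspace rendering of that convention (the real macros live in linux/err.h; these are cut-down copies):

	#include <stdio.h>
	#include <errno.h>

	#define MAX_ERRNO 4095

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct trans { int items; };

	static struct trans *start_transaction(int num_items)
	{
		static struct trans t;
		if (num_items > 8)	/* pretend the reservation failed */
			return ERR_PTR(-ENOSPC);
		t.items = num_items;
		return &t;
	}

	int main(void)
	{
		struct trans *trans = start_transaction(6);
		if (IS_ERR(trans)) {
			fprintf(stderr, "start failed: %ld\n", PTR_ERR(trans));
			return 1;
		}
		printf("reserved for %d items\n", trans->items);
		return 0;
	}
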
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
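
The new btrfs_should_end_transaction() declared above lets a long-running operation ask, mid-loop, whether a commit is pending or the global reservation is getting tight, and restart with a fresh handle instead of pinning one transaction open. A standalone caricature of that batching loop (the budget numbers are invented):

	#include <stdio.h>
	#include <stdbool.h>

	/* stand-in for btrfs_should_end_transaction(): says "stop" once
	 * the invented budget for this handle is used up */
	static bool should_end(int used, int budget)
	{
		return used >= budget;
	}

	int main(void)
	{
		int items = 10, done = 0, handles = 0;

		while (done < items) {
			handles++;		/* "start transaction" */
			int used = 0;
			while (done < items && !should_end(used, 4)) {
				done++;		/* one unit of work */
				used++;
			}
			/* "end transaction", releasing the reservation */
		}
		printf("%d items across %d handles\n", done, handles);
		return 0;
	}
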
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
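
The btrfs_defrag_leaves() change above tightens the same theme: instead of warning and pressing on, any nonzero return from the realloc pass (other than the expected -EAGAIN restart) now bails out through the existing out: label. A standalone model of that early-exit shape, with an invented work step:

	#include <stdio.h>
	#include <errno.h>

	static int realloc_node(int step)
	{
		return step == 2 ? -EIO : 0;	/* invented failure */
	}

	static int defrag_leaves(void)
	{
		int ret = 0;

		for (int step = 0; step < 4; step++) {
			ret = realloc_node(step);
			if (ret)
				goto out;	/* propagate, don't continue */
		}
	out:
		return ret;
	}

	int main(void)
	{
		printf("defrag_leaves() = %d\n", defrag_leaves());
		return 0;
	}
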
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root) 135 struct btrfs_root *root)
135{ 136{
136 int ret; 137 int ret;
138 int err = 0;
137 139
138 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 141 if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
154 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
155 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
157 BUG_ON(ret); 159 if (ret)
160 err = ret;
158 } 161 }
159 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
160 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
161 BUG_ON(ret); 164 if (ret)
165 err = ret;
162 } 166 }
163 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
164 root->log_batch++; 168 root->log_batch++;
165 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
166 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
167 return 0; 171 return err;
168} 172}
169 173
170/* 174/*
@@ -375,7 +379,7 @@ insert:
375 BUG_ON(ret); 379 BUG_ON(ret);
376 } 380 }
377 } else if (ret) { 381 } else if (ret) {
378 BUG(); 382 return ret;
379 } 383 }
380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
381 path->slots[0]); 385 path->slots[0]);
@@ -1698,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1702
1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1700 1704
1701 wc->process_func(root, next, wc, ptr_gen);
1702
1703 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1704 path->slots[*level]++; 1708 path->slots[*level]++;
1705 if (wc->free) { 1709 if (wc->free) {
1706 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1733 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1735 1739
1736 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1737 parent = path->nodes[*level];
1738 else
1739 parent = path->nodes[*level + 1];
1740
1741 bytenr = path->nodes[*level]->start;
1742
1743 blocksize = btrfs_level_size(root, *level);
1744 root_owner = btrfs_header_owner(parent);
1745 root_gen = btrfs_header_generation(parent);
1746
1747 wc->process_func(root, path->nodes[*level], wc,
1748 btrfs_header_generation(path->nodes[*level]));
1749
1750 if (wc->free) {
1751 next = path->nodes[*level];
1752 btrfs_tree_lock(next);
1753 clean_tree_block(trans, root, next);
1754 btrfs_set_lock_blocking(next);
1755 btrfs_wait_tree_block_writeback(next);
1756 btrfs_tree_unlock(next);
1757
1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1760 BUG_ON(ret);
1761 }
1762 free_extent_buffer(path->nodes[*level]);
1763 path->nodes[*level] = NULL;
1764 *level += 1;
1765 1741
1766 cond_resched(); 1742 cond_resched();
1767 return 0; 1743 return 0;
@@ -1780,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1780 1756
1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1782 slot = path->slots[i]; 1758 slot = path->slots[i];
1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1784 struct extent_buffer *node; 1760 struct extent_buffer *node;
1785 node = path->nodes[i]; 1761 node = path->nodes[i];
1786 path->slots[i]++; 1762 path->slots[i]++;
@@ -2046,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2046 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2047 2023
2048 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2049 BUG_ON(ret);
2050 2025
2051 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2056 } 2031 }
2057 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2058 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2112,10 @@ out:
2128 return 0; 2112 return 0;
2129} 2113}
2130 2114
2131/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2132 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2133 * at commit time of the full transaction
2134 */
2135int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2136{ 2117{
2137 int ret; 2118 int ret;
2138 struct btrfs_root *log;
2139 struct key;
2140 u64 start; 2119 u64 start;
2141 u64 end; 2120 u64 end;
2142 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2144,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2144 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2145 }; 2124 };
2146 2125
2147 if (!root->log_root || root->fs_info->log_root_recovering)
2148 return 0;
2149
2150 log = root->log_root;
2151 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2152 BUG_ON(ret); 2127 BUG_ON(ret);
2153 2128
@@ -2161,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2162 } 2137 }
2163 2138
2164 if (log->log_transid > 0) {
2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2166 &log->root_key);
2167 BUG_ON(ret);
2168 }
2169 root->log_root = NULL;
2170 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2171 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2172 return 0; 2163 return 0;
2173} 2164}
2174 2165
@@ -2202,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2202 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2203 struct btrfs_path *path; 2194 struct btrfs_path *path;
2204 int ret; 2195 int ret;
2196 int err = 0;
2205 int bytes_del = 0; 2197 int bytes_del = 0;
2206 2198
2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2217 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2219 name, name_len, -1); 2211 name, name_len, -1);
2220 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2222 bytes_del += name_len; 2218 bytes_del += name_len;
2223 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2225,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2225 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2227 index, name, name_len, -1); 2223 index, name, name_len, -1);
2228 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2230 bytes_del += name_len; 2230 bytes_del += name_len;
2231 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2243,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2243 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2244 2244
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2246 if (ret == 0) { 2250 if (ret == 0) {
2247 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2248 u64 i_size; 2252 u64 i_size;
@@ -2260,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2260 ret = 0; 2264 ret = 0;
2261 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2262 } 2266 }
2263 2267fail:
2264 btrfs_free_path(path); 2268 btrfs_free_path(path);
2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2266 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2267 2275
2268 return 0; 2276 return 0;
@@ -2290,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2291 dirid, &index); 2299 dirid, &index);
2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2293 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2294 2306
2295 return ret; 2307 return ret;
@@ -2317,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2317 else 2329 else
2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2320 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2321 2334
2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2323 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2342,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2342 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2343 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2344 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2345 int ret; 2359 int ret;
2346 int i; 2360 int i;
2347 int nritems; 2361 int nritems;
@@ -2404,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2404 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2405 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2406 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2407 } 2425 }
2408 } 2426 }
2409 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2431,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2431 goto done; 2449 goto done;
2432 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2433 &min_key); 2451 &min_key);
2434 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2435 } 2456 }
2436 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2437 2458
@@ -2453,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2453 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2454 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2455 &tmp); 2476 &tmp);
2456 2477 if (ret)
2457 BUG_ON(ret); 2478 err = ret;
2458 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2459 goto done; 2481 goto done;
2460 } 2482 }
2461 } 2483 }
2462done: 2484done:
2463 *last_offset_ret = last_offset;
2464 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2465 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2466 2487
2467 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2468 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2469 first_offset, last_offset); 2490 /*
2470 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2471 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2472} 2501}
2473 2502
2474/* 2503/*
@@ -2500,7 +2529,8 @@ again:
2500 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2501 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2502 &max_key); 2531 &max_key);
2503 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2504 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2505 break; 2535 break;
2506 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2534,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2534 2564
2535 while (1) { 2565 while (1) {
2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2537 2567 BUG_ON(ret == 0);
2538 if (ret != 1) 2568 if (ret < 0)
2539 break; 2569 break;
2540 2570
2541 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2553,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2553 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2554 } 2584 }
2555 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2556 return 0; 2586 return ret;
2557} 2587}
2558 2588
2559static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2586,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2586 } 2616 }
2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2588 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2589 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2590 2623
2591 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2659 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2660 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2661 */ 2694 */
2695 ret = 0;
2662 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2664 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2665 list); 2699 list);
2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2667 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2668 list_del(&sums->list); 2702 list_del(&sums->list);
2669 kfree(sums); 2703 kfree(sums);
2670 } 2704 }
2671 return 0; 2705 return ret;
2672} 2706}
2673 2707
2674/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2696,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2696 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2697 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2698 u32 size; 2732 u32 size;
2733 int err = 0;
2699 int ret; 2734 int ret;
2700 int nritems; 2735 int nritems;
2701 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2738,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2738 } else { 2773 } else {
2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2740 } 2775 }
2741 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2742 path->keep_locks = 1; 2780 path->keep_locks = 1;
2743 2781
2744 while (1) { 2782 while (1) {
@@ -2767,7 +2805,10 @@ again:
2767 2805
2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2769 ins_nr, inode_only); 2807 ins_nr, inode_only);
2770 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2771 ins_nr = 1; 2812 ins_nr = 1;
2772 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2773next_slot: 2814next_slot:
@@ -2783,7 +2824,10 @@ next_slot:
2783 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2784 ins_start_slot, 2825 ins_start_slot,
2785 ins_nr, inode_only); 2826 ins_nr, inode_only);
2786 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2787 ins_nr = 0; 2831 ins_nr = 0;
2788 } 2832 }
2789 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2801,7 +2845,10 @@ next_slot:
2801 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2802 ins_start_slot, 2846 ins_start_slot,
2803 ins_nr, inode_only); 2847 ins_nr, inode_only);
2804 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2805 ins_nr = 0; 2852 ins_nr = 0;
2806 } 2853 }
2807 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2809,14 +2856,18 @@ next_slot:
2809 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2810 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2812 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2813 } 2863 }
2814 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2816 2867
2817 btrfs_free_path(path); 2868 btrfs_free_path(path);
2818 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2819 return 0; 2870 return err;
2820} 2871}
2821 2872
2822/* 2873/*
@@ -2941,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2941 goto end_no_trans; 2992 goto end_no_trans;
2942 } 2993 }
2943 2994
2944 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2945 2998
2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2947 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2948 3002
2949 /* 3003 /*
2950 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2954 */ 3008 */
2955 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2956 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2958 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2959 3015
2960 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2961 while (1) { 3017 while (1) {
@@ -2969,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2969 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2970 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2972 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2973 } 3030 }
2974 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2975 break; 3032 break;
2976 3033
2977 parent = parent->d_parent; 3034 parent = parent->d_parent;
2978 } 3035 }
2979no_parent:
2980 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2981 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2982end_no_trans: 3044end_no_trans:
2983 return ret; 3045 return ret;
@@ -3019,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3019 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3020 BUG_ON(!path); 3082 BUG_ON(!path);
3021 3083
3022 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3023 3085
3024 wc.trans = trans; 3086 wc.trans = trans;
3025 wc.pin = 1; 3087 wc.pin = 1;
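
Taken together, the tree-log.c hunks apply one idiom: each BUG_ON(ret) after a fallible call becomes explicit propagation into a local err, with a single label doing the cleanup and the function returning err instead of a hard-coded 0. A minimal standalone sketch of that idiom (helper names are illustrative, not kernel APIs):

    #include <errno.h>

    /* stand-ins for fallible btrfs helpers; names are illustrative only */
    static int log_one_item(void)      { return 0; }
    static int insert_range_key(void)  { return -ENOSPC; }

    static int log_items_sketch(void)
    {
            int err = 0;    /* first error seen; what the caller gets */
            int ret;

            ret = log_one_item();
            if (ret) {
                    err = ret;
                    goto done;      /* unwind through one cleanup point */
            }

            ret = insert_range_key();
            if (ret)
                    err = ret;
    done:
            /* release paths and locks here, exactly once */
            return err;             /* 0 on success, -ENOSPC etc. on failure */
    }

The payoff is visible in btrfs_log_inode_parent() above: instead of crashing on -ENOSPC, the error is asserted to be -ENOSPC and converted into a forced full transaction commit by setting last_trans_log_full_commit.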
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
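
One change here is purely mechanical: the new #include <linux/slab.h> lines in this file (and in the cachefiles and ceph files further down) come from the tree-wide cleanup that stopped slab.h from being pulled in implicitly via percpu.h, so any file calling kmalloc()/kfree() now has to include it directly.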
@@ -1096,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1096 if (!path) 1097 if (!path)
1097 return -ENOMEM; 1098 return -ENOMEM;
1098 1099
1099 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1100 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1101 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1102 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1485,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1485 goto error; 1486 goto error;
1486 } 1487 }
1487 1488
1488 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1489 lock_chunks(root); 1490 lock_chunks(root);
1490 1491
1491 device->barriers = 1; 1492 device->barriers = 1;
@@ -1750,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1750 1751
1751 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1752 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1753 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1754 1756
1755 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1756 BUG_ON(!trans); 1758 BUG_ON(!trans);
1757 1759
1758 lock_chunks(root); 1760 lock_chunks(root);
@@ -1924,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1924 break; 1926 break;
1925 BUG_ON(ret); 1927 BUG_ON(ret);
1926 1928
1927 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1928 BUG_ON(!trans); 1930 BUG_ON(!trans);
1929 1931
1930 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2093,11 +2095,7 @@ again:
2093 } 2095 }
2094 2096
2095 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2096 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2097 if (!trans) {
2098 ret = -ENOMEM;
2099 goto done;
2100 }
2101 lock_chunks(root); 2099 lock_chunks(root);
2102 2100
2103 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2198,9 +2196,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2198 min_stripes = 2; 2196 min_stripes = 2;
2199 } 2197 }
2200 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2201 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2199 if (fs_devices->rw_devices < 2)
2202 if (num_stripes < 2)
2203 return -ENOSPC; 2200 return -ENOSPC;
2201 num_stripes = 2;
2204 min_stripes = 2; 2202 min_stripes = 2;
2205 } 2203 }
2206 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2244,8 +2242,16 @@ again:
2244 do_div(calc_size, stripe_len); 2242 do_div(calc_size, stripe_len);
2245 calc_size *= stripe_len; 2243 calc_size *= stripe_len;
2246 } 2244 }
2245
2247 /* we don't want tiny stripes */ 2246 /* we don't want tiny stripes */
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249
2250 /*
2251 * we're about to do_div by the stripe_len so lets make sure
2252 * we end up with something bigger than a stripe
2253 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4);
2249 2255
2250 do_div(calc_size, stripe_len); 2256 do_div(calc_size, stripe_len);
2251 calc_size *= stripe_len; 2257 calc_size *= stripe_len;
@@ -3389,6 +3395,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3389 key.type = 0; 3395 key.type = 0;
3390again: 3396again:
3391 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3397 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3398 if (ret < 0)
3399 goto error;
3392 while (1) { 3400 while (1) {
3393 leaf = path->nodes[0]; 3401 leaf = path->nodes[0];
3394 slot = path->slots[0]; 3402 slot = path->slots[0];
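
A second theme in volumes.c (and in xattr.c just below) is the reworked btrfs_start_transaction() convention: the second argument now names how many metadata items the caller wants reserved (0 for none), and failure comes back as an ERR_PTR value rather than NULL, which is why the old "if (!trans)" checks disappear. Sketched as a caller would use it, assuming the usual err.h macros:

            struct btrfs_trans_handle *trans;

            /* reserve room for two metadata items up front */
            trans = btrfs_start_transaction(root, 2);
            if (IS_ERR(trans))
                    return PTR_ERR(trans);  /* typically -ENOSPC */

            /* ... do the reserved modifications ... */

            btrfs_end_transaction(trans, root);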
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
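
The xattr changes themselves are a constification: the handler table and its entries become const, matching the VFS-wide switch to const struct xattr_handler *. After the patch the table has this shape (entries as in the hunk above; the array is NULL-terminated as the VFS expects):

    const struct xattr_handler *btrfs_xattr_handlers[] = {
    #ifdef CONFIG_BTRFS_FS_POSIX_ACL
            &btrfs_xattr_acl_access_handler,
            &btrfs_xattr_acl_default_handler,
    #endif
            NULL,   /* the VFS walks this array until it hits NULL */
    };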
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
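
The thaw rework above is also a good example of the iterate_supers() conversion pattern: the open-coded walk of super_blocks, with its manual s_count references, lock drops and restart label, collapses into a callback plus a cookie, with the VFS iterator owning the locking and refcounting. In outline:

    /* per-superblock work; the iterator holds the superblock for us */
    static void thaw_one(struct super_block *sb, void *unused)
    {
            /* ... act on sb ... */
    }

            /* caller side: no list locking, refcounting or restarts */
            iterate_supers(thaw_one, NULL);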
@@ -1957,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1957} 1949}
1958 1950
1959/* 1951/*
1960 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1961 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1962 * 1954 * The filesystem needs to handle block truncation upon failure.
1963 * If *pagep is not NULL, then block_write_begin uses the locked page
1964 * at *pagep rather than allocating its own. In this case, the page will
1965 * not be unlocked or deallocated on failure.
1966 */ 1955 */
1967int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1968 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1969 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1970 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -2000,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2000 unlock_page(page); 1989 unlock_page(page);
2001 page_cache_release(page); 1990 page_cache_release(page);
2002 *pagep = NULL; 1991 *pagep = NULL;
2003
2004 /*
2005 * prepare_write() may have instantiated a few blocks
2006 * outside i_size. Trim these off again. Don't need
2007 * i_size_read because we hold i_mutex.
2008 */
2009 if (pos + len > inode->i_size)
2010 vmtruncate(inode, inode->i_size);
2011 } 1992 }
2012 } 1993 }
2013 1994
2014out: 1995out:
2015 return status; 1996 return status;
2016} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2017EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2018 2037
2019int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
@@ -2332,7 +2351,7 @@ out:
2332 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2333 * We may have to extend the file. 2352 * We may have to extend the file.
2334 */ 2353 */
2335int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2336 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2337 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2338 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2353,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2353 } 2372 }
2354 2373
2355 *pagep = NULL; 2374 *pagep = NULL;
2356 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2357 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2358out: 2377out:
2359 return err; 2378 return err;
2360} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2361EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2362 2400
2363int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2389,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2389 * 2427 *
2390 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2391 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2392 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2393 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2394 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2395 * unlock the page. 2433 * unlock the page.
@@ -2472,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2472} 2510}
2473 2511
2474/* 2512/*
2475 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2476 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2477 */ 2516 */
2478int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2479 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2480 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2481 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2508,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2508 unlock_page(page); 2547 unlock_page(page);
2509 page_cache_release(page); 2548 page_cache_release(page);
2510 *pagep = NULL; 2549 *pagep = NULL;
2511 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2512 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2513 } 2552 }
2514 2553
2515 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2613,8 +2652,34 @@ out_release:
2613 page_cache_release(page); 2652 page_cache_release(page);
2614 *pagep = NULL; 2653 *pagep = NULL;
2615 2654
2616 if (pos + len > inode->i_size) 2655 return ret;
2617 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2618 2683
2619 return ret; 2684 return ret;
2620} 2685}
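
The net effect for filesystems: a converted ("new truncate sequence") filesystem calls the _newtrunc variant and trims any blocks instantiated past EOF itself, while unconverted callers keep the old names and the old vmtruncate() behaviour. A hypothetical converted caller might look like this (the myfs_* names are placeholders, not real kernel symbols; the block_write_begin_newtrunc signature is as shown in the hunk above):

    static int myfs_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
    {
            int ret;

            *pagep = NULL;
            ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
                                             pagep, fsdata, myfs_get_block);
            if (unlikely(ret)) {
                    /* new sequence: the fs, not the helper, trims blocks
                     * instantiated beyond i_size */
                    myfs_truncate_failed_write(mapping->host);
            }
            return ret;
    }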
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
34 loff_t i_size; /* object size */ 34 loff_t i_size; /* object size */
35 unsigned long flags; 35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
37 atomic_t usage; /* object usage count */ 38 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */ 39 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */ 40 uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -92,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
92} 93}
93 94
94/* 95/*
96 * mark the owner of a dentry, if there is one, to indicate that that dentry
97 * has been preemptively deleted
98 * - the caller must hold the i_mutex on the dentry's parent as required to
99 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
100 */
101static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
102 struct dentry *dentry)
103{
104 struct cachefiles_object *object;
105 struct rb_node *p;
106
107 _enter(",'%*.*s'",
108 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
109
110 write_lock(&cache->active_lock);
111
112 p = cache->active_nodes.rb_node;
113 while (p) {
114 object = rb_entry(p, struct cachefiles_object, active_node);
115 if (object->dentry > dentry)
116 p = p->rb_left;
117 else if (object->dentry < dentry)
118 p = p->rb_right;
119 else
120 goto found_dentry;
121 }
122
123 write_unlock(&cache->active_lock);
124 _leave(" [no owner]");
125 return;
126
127 /* found the dentry for */
128found_dentry:
129 kdebug("preemptive burial: OBJ%x [%s] %p",
130 object->fscache.debug_id,
131 fscache_object_states[object->fscache.state],
132 dentry);
133
134 if (object->fscache.state < FSCACHE_OBJECT_DYING) {
135 printk(KERN_ERR "\n");
136 printk(KERN_ERR "CacheFiles: Error:"
137 " Can't preemptively bury live object\n");
138 cachefiles_printk_object(object, NULL);
139 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
140 printk(KERN_ERR "CacheFiles: Error:"
141 " Object already preemptively buried\n");
142 }
143
144 write_unlock(&cache->active_lock);
145 _leave(" [owner marked]");
146}
147
148/*
95 * record the fact that an object is now active 149 * record the fact that an object is now active
96 */ 150 */
97static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 151static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -218,7 +272,8 @@ requeue:
218 */ 272 */
219static int cachefiles_bury_object(struct cachefiles_cache *cache, 273static int cachefiles_bury_object(struct cachefiles_cache *cache,
220 struct dentry *dir, 274 struct dentry *dir,
221 struct dentry *rep) 275 struct dentry *rep,
276 bool preemptive)
222{ 277{
223 struct dentry *grave, *trap; 278 struct dentry *grave, *trap;
224 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
@@ -228,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
228 dir->d_name.len, dir->d_name.len, dir->d_name.name, 283 dir->d_name.len, dir->d_name.len, dir->d_name.name,
229 rep->d_name.len, rep->d_name.len, rep->d_name.name); 284 rep->d_name.len, rep->d_name.len, rep->d_name.name);
230 285
286 _debug("remove %p from %p", rep, dir);
287
231 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
232 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
233 _debug("unlink stale object"); 290 _debug("unlink stale object");
234 ret = vfs_unlink(dir->d_inode, rep); 291 ret = vfs_unlink(dir->d_inode, rep);
235 292
293 if (preemptive)
294 cachefiles_mark_object_buried(cache, rep);
295
236 mutex_unlock(&dir->d_inode->i_mutex); 296 mutex_unlock(&dir->d_inode->i_mutex);
237 297
238 if (ret == -EIO) 298 if (ret == -EIO)
@@ -324,6 +384,9 @@ try_again:
324 if (ret != 0 && ret != -ENOMEM) 384 if (ret != 0 && ret != -ENOMEM)
325 cachefiles_io_error(cache, "Rename failed with error %d", ret); 385 cachefiles_io_error(cache, "Rename failed with error %d", ret);
326 386
387 if (preemptive)
388 cachefiles_mark_object_buried(cache, rep);
389
327 unlock_rename(cache->graveyard, dir); 390 unlock_rename(cache->graveyard, dir);
328 dput(grave); 391 dput(grave);
329 _leave(" = 0"); 392 _leave(" = 0");
@@ -339,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
339 struct dentry *dir; 402 struct dentry *dir;
340 int ret; 403 int ret;
341 404
342 _enter(",{%p}", object->dentry); 405 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
343 406
344 ASSERT(object->dentry); 407 ASSERT(object->dentry);
345 ASSERT(object->dentry->d_inode); 408 ASSERT(object->dentry->d_inode);
@@ -349,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
349 412
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 413 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 414
352 /* we need to check that our parent is _still_ our parent - it may have 415 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
353 * been renamed */ 416 /* object allocation for the same key preemptively deleted this
354 if (dir == object->dentry->d_parent) { 417 * object's file so that it could create its own file */
355 ret = cachefiles_bury_object(cache, dir, object->dentry); 418 _debug("object preemptively buried");
356 } else {
357 /* it got moved, presumably by cachefilesd culling it, so it's
358 * no longer in the key path and we can ignore it */
359 mutex_unlock(&dir->d_inode->i_mutex); 419 mutex_unlock(&dir->d_inode->i_mutex);
360 ret = 0; 420 ret = 0;
421 } else {
422 /* we need to check that our parent is _still_ our parent - it
423 * may have been renamed */
424 if (dir == object->dentry->d_parent) {
425 ret = cachefiles_bury_object(cache, dir,
426 object->dentry, false);
427 } else {
428 /* it got moved, presumably by cachefilesd culling it,
429 * so it's no longer in the key path and we can ignore
430 * it */
431 mutex_unlock(&dir->d_inode->i_mutex);
432 ret = 0;
433 }
361 } 434 }
362 435
363 dput(dir); 436 dput(dir);
@@ -380,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
380 const char *name; 453 const char *name;
381 int ret, nlen; 454 int ret, nlen;
382 455
383 _enter("{%p},,%s,", parent->dentry, key); 456 _enter("OBJ%x{%p},OBJ%x,%s,",
457 parent->fscache.debug_id, parent->dentry,
458 object->fscache.debug_id, key);
384 459
385 cache = container_of(parent->fscache.cache, 460 cache = container_of(parent->fscache.cache,
386 struct cachefiles_cache, cache); 461 struct cachefiles_cache, cache);
@@ -508,7 +583,7 @@ lookup_again:
508 * mutex) */ 583 * mutex) */
509 object->dentry = NULL; 584 object->dentry = NULL;
510 585
511 ret = cachefiles_bury_object(cache, dir, next); 586 ret = cachefiles_bury_object(cache, dir, next, true);
512 dput(next); 587 dput(next);
513 next = NULL; 588 next = NULL;
514 589
@@ -827,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
827 /* actually remove the victim (drops the dir mutex) */ 902 /* actually remove the victim (drops the dir mutex) */
828 _debug("bury"); 903 _debug("bury");
829 904
830 ret = cachefiles_bury_object(cache, dir, victim); 905 ret = cachefiles_bury_object(cache, dir, victim, false);
831 if (ret < 0) 906 if (ret < 0)
832 goto error; 907 goto error;
833 908
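
Note how cachefiles_mark_object_buried() finds the owning object: the cache's active_nodes rbtree is keyed on the raw dentry pointer value, so the walk is a pointer comparison, not a name comparison. The lookup shape, as a standalone sketch:

    #include <stdint.h>

    struct node {
            struct node *left, *right;
            const void *key;        /* here: the dentry pointer itself */
    };

    static struct node *find_by_ptr(struct node *p, const void *dentry)
    {
            uintptr_t k = (uintptr_t)dentry;

            while (p) {
                    uintptr_t n = (uintptr_t)p->key;

                    if (n > k)
                            p = p->left;
                    else if (n < k)
                            p = p->right;
                    else
                            return p;       /* found the owner to mark buried */
            }
            return NULL;                    /* no owner: nothing to mark */
    }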
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
77/* 77/*
78 * check the security details of the on-disk cache 78 * check the security details of the on-disk cache
79 * - must be called with security override in force 79 * - must be called with security override in force
80 * - must return with a security override in force - even in the case of an
81 * error
80 */ 82 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 83int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root, 84 struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
99 * which create files */ 101 * which create files */
100 ret = set_create_files_as(new, root->d_inode); 102 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) { 103 if (ret < 0) {
104 abort_creds(new);
105 cachefiles_begin_secure(cache, _saved_cred);
102 _leave(" = %d [cfa]", ret); 106 _leave(" = %d [cfa]", ret);
103 return ret; 107 return ret;
104 } 108 }
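
The security.c hunk is a contract fix rather than a feature: cachefiles_determine_cache_security() is documented (by the new comment) to return with a security override in force even on error, so the set_create_files_as() failure path now drops the half-built credentials with abort_creds() and re-enters the cache's secure state before returning.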
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 23bb0ceabe31..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -5,6 +5,7 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/pagemap.h> 6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */ 7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
8#include <linux/pagevec.h> 9#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
10 11
@@ -273,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0; 275 int rc = 0;
275 struct page **pages; 276 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset; 277 loff_t offset;
278 u64 len; 278 u64 len;
279 279
@@ -296,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
296 if (rc < 0) 296 if (rc < 0)
297 goto out; 297 goto out;
298 298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page = 301 struct page *page =
@@ -311,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
311 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 } 310 }
313 311
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page); 313 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page); 315 inode, page);
@@ -322,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
322 flush_dcache_page(page); 320 flush_dcache_page(page);
323 SetPageUptodate(page); 321 SetPageUptodate(page);
324 unlock_page(page); 322 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 } 324 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0; 325 rc = 0;
330 326
331out: 327out:
@@ -336,16 +332,15 @@ out:
336/* 332/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the 333 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back. 334 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */ 335 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode, 336static struct ceph_snap_context *get_oldest_context(struct inode *inode,
343 u64 *snap_size) 337 u64 *snap_size)
344{ 338{
345 struct ceph_inode_info *ci = ceph_inode(inode); 339 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL; 340 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL; 341 struct ceph_cap_snap *capsnap = NULL;
348 342
343 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 344 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 345 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages); 346 capsnap->context, capsnap->dirty_pages);
@@ -356,21 +351,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
356 break; 351 break;
357 } 352 }
358 } 353 }
359 if (!snapc && ci->i_snap_realm) { 354 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 355 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n", 356 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head); 357 snapc, ci->i_wrbuffer_ref_head);
363 } 358 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock); 359 spin_unlock(&inode->i_lock);
375 return snapc; 360 return snapc;
376} 361}
@@ -391,7 +376,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
391 int len = PAGE_CACHE_SIZE; 376 int len = PAGE_CACHE_SIZE;
392 loff_t i_size; 377 loff_t i_size;
393 int err = 0; 378 int err = 0;
394 struct ceph_snap_context *snapc; 379 struct ceph_snap_context *snapc, *oldest;
395 u64 snap_size = 0; 380 u64 snap_size = 0;
396 long writeback_stat; 381 long writeback_stat;
397 382
@@ -412,13 +397,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
412 dout("writepage %p page %p not dirty?\n", inode, page); 397 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out; 398 goto out;
414 } 399 }
415 if (snapc != get_oldest_context(inode, &snap_size)) { 400 oldest = get_oldest_context(inode, &snap_size);
401 if (snapc->seq > oldest->seq) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n", 402 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private); 403 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */ 404 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0); 405 WARN_ON((current->flags & PF_MEMALLOC) == 0);
406 ceph_put_snap_context(oldest);
420 goto out; 407 goto out;
421 } 408 }
409 ceph_put_snap_context(oldest);
422 410
423 /* is this a partial page at end of file? */ 411 /* is this a partial page at end of file? */
424 if (snap_size) 412 if (snap_size)
@@ -457,7 +445,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
457 ClearPagePrivate(page); 445 ClearPagePrivate(page);
458 end_page_writeback(page); 446 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 447 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc); 448 ceph_put_snap_context(snapc); /* page's reference */
461out: 449out:
462 return err; 450 return err;
463} 451}
@@ -511,12 +499,11 @@ static void writepages_finish(struct ceph_osd_request *req,
511 int i; 499 int i;
512 struct ceph_snap_context *snapc = req->r_snapc; 500 struct ceph_snap_context *snapc = req->r_snapc;
513 struct address_space *mapping = inode->i_mapping; 501 struct address_space *mapping = inode->i_mapping;
514 struct writeback_control *wbc = req->r_wbc;
515 __s32 rc = -EIO; 502 __s32 rc = -EIO;
516 u64 bytes = 0; 503 u64 bytes = 0;
517 struct ceph_client *client = ceph_inode_to_client(inode); 504 struct ceph_client *client = ceph_inode_to_client(inode);
518 long writeback_stat; 505 long writeback_stat;
519 unsigned issued = __ceph_caps_issued(ci, NULL); 506 unsigned issued = ceph_caps_issued(ci);
520 507
521 /* parse reply */ 508 /* parse reply */
522 replyhead = msg->front.iov_base; 509 replyhead = msg->front.iov_base;
@@ -553,13 +540,9 @@ static void writepages_finish(struct ceph_osd_request *req,
553 clear_bdi_congested(&client->backing_dev_info, 540 clear_bdi_congested(&client->backing_dev_info,
554 BLK_RW_ASYNC); 541 BLK_RW_ASYNC);
555 542
556 if (i >= wrote) { 543 ceph_put_snap_context((void *)page->private);
557 dout("inode %p skipping page %p\n", inode, page);
558 wbc->pages_skipped++;
559 }
560 page->private = 0; 544 page->private = 0;
561 ClearPagePrivate(page); 545 ClearPagePrivate(page);
562 ceph_put_snap_context(snapc);
563 dout("unlocking %d %p\n", i, page); 546 dout("unlocking %d %p\n", i, page);
564 end_page_writeback(page); 547 end_page_writeback(page);
565 548
@@ -580,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
580 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
581 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
582 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
583 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
584 else 567 else
585 kfree(req->r_pages); 568 kfree(req->r_pages);
586 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
@@ -617,7 +600,7 @@ static int ceph_writepages_start(struct address_space *mapping,
617 int range_whole = 0; 600 int range_whole = 0;
618 int should_loop = 1; 601 int should_loop = 1;
619 pgoff_t max_pages = 0, max_pages_ever = 0; 602 pgoff_t max_pages = 0, max_pages_ever = 0;
620 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; 603 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
621 struct pagevec pvec; 604 struct pagevec pvec;
622 int done = 0; 605 int done = 0;
623 int rc = 0; 606 int rc = 0;
@@ -769,9 +752,10 @@ get_more_pages:
769 } 752 }
770 753
771 /* only if matching snap context */ 754 /* only if matching snap context */
772 if (snapc != (void *)page->private) { 755 pgsnapc = (void *)page->private;
773 dout("page snapc %p != oldest %p\n", 756 if (pgsnapc->seq > snapc->seq) {
774 (void *)page->private, snapc); 757 dout("page snapc %p %lld > oldest %p %lld\n",
758 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
775 unlock_page(page); 759 unlock_page(page);
776 if (!locked_pages) 760 if (!locked_pages)
777 continue; /* keep looking for snap */ 761 continue; /* keep looking for snap */
@@ -805,7 +789,6 @@ get_more_pages:
805 alloc_page_vec(client, req); 789 alloc_page_vec(client, req);
806 req->r_callback = writepages_finish; 790 req->r_callback = writepages_finish;
807 req->r_inode = inode; 791 req->r_inode = inode;
808 req->r_wbc = wbc;
809 } 792 }
810 793
811 /* note position of first page in pvec */ 794 /* note position of first page in pvec */
@@ -913,12 +896,19 @@ static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc) 896 struct ceph_snap_context *snapc)
914{ 897{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 898 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq; 899 int ret = !oldest || snapc->seq <= oldest->seq;
900
901 ceph_put_snap_context(oldest);
902 return ret;
917} 903}
918 904
919/* 905/*
920 * We are only allowed to write into/dirty the page if the page is 906 * We are only allowed to write into/dirty the page if the page is
921 * clean, or already dirty within the same snap context. 907 * clean, or already dirty within the same snap context.
908 *
909 * called with page locked.
910 * return success with page locked,
911 * or any failure (incl -EAGAIN) with page unlocked.
922 */ 912 */
923static int ceph_update_writeable_page(struct file *file, 913static int ceph_update_writeable_page(struct file *file,
924 loff_t pos, unsigned len, 914 loff_t pos, unsigned len,
@@ -931,8 +921,8 @@ static int ceph_update_writeable_page(struct file *file,
931 int pos_in_page = pos & ~PAGE_CACHE_MASK; 921 int pos_in_page = pos & ~PAGE_CACHE_MASK;
932 int end_in_page = pos_in_page + len; 922 int end_in_page = pos_in_page + len;
933 loff_t i_size; 923 loff_t i_size;
934 struct ceph_snap_context *snapc;
935 int r; 924 int r;
925 struct ceph_snap_context *snapc, *oldest;
936 926
937retry_locked: 927retry_locked:
938 /* writepages currently holds page lock, but if we change that later, */ 928 /* writepages currently holds page lock, but if we change that later, */
@@ -942,30 +932,34 @@ retry_locked:
942 BUG_ON(!ci->i_snap_realm); 932 BUG_ON(!ci->i_snap_realm);
943 down_read(&mdsc->snap_rwsem); 933 down_read(&mdsc->snap_rwsem);
944 BUG_ON(!ci->i_snap_realm->cached_context); 934 BUG_ON(!ci->i_snap_realm->cached_context);
945 if (page->private && 935 snapc = (void *)page->private;
946 (void *)page->private != ci->i_snap_realm->cached_context) { 936 if (snapc && snapc != ci->i_head_snapc) {
947 /* 937 /*
948 * this page is already dirty in another (older) snap 938 * this page is already dirty in another (older) snap
949 * context! is it writeable now? 939 * context! is it writeable now?
950 */ 940 */
951 snapc = get_oldest_context(inode, NULL); 941 oldest = get_oldest_context(inode, NULL);
952 up_read(&mdsc->snap_rwsem); 942 up_read(&mdsc->snap_rwsem);
953 943
954 if (snapc != (void *)page->private) { 944 if (snapc->seq > oldest->seq) {
945 ceph_put_snap_context(oldest);
955 dout(" page %p snapc %p not current or oldest\n", 946 dout(" page %p snapc %p not current or oldest\n",
956 page, (void *)page->private); 947 page, snapc);
957 /* 948 /*
958 * queue for writeback, and wait for snapc to 949 * queue for writeback, and wait for snapc to
959 * be writeable or written 950 * be writeable or written
960 */ 951 */
961 snapc = ceph_get_snap_context((void *)page->private); 952 snapc = ceph_get_snap_context(snapc);
962 unlock_page(page); 953 unlock_page(page);
963 ceph_queue_writeback(inode); 954 ceph_queue_writeback(inode);
964 wait_event_interruptible(ci->i_cap_wq, 955 r = wait_event_interruptible(ci->i_cap_wq,
965 context_is_writeable_or_written(inode, snapc)); 956 context_is_writeable_or_written(inode, snapc));
966 ceph_put_snap_context(snapc); 957 ceph_put_snap_context(snapc);
958 if (r == -ERESTARTSYS)
959 return r;
967 return -EAGAIN; 960 return -EAGAIN;
968 } 961 }
962 ceph_put_snap_context(oldest);
969 963
970 /* yay, writeable, do it now (without dropping page lock) */ 964 /* yay, writeable, do it now (without dropping page lock) */
971 dout(" page %p snapc %p not current, but oldest\n", 965 dout(" page %p snapc %p not current, but oldest\n",
@@ -1035,7 +1029,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1035 int r; 1029 int r;
1036 1030
1037 do { 1031 do {
1038 /* get a page*/ 1032 /* get a page */
1039 page = grab_cache_page_write_begin(mapping, index, 0); 1033 page = grab_cache_page_write_begin(mapping, index, 0);
1040 if (!page) 1034 if (!page)
1041 return -ENOMEM; 1035 return -ENOMEM;
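
Several of the addr.c hunks fix the same two bugs: get_oldest_context() returns a referenced snap context, so callers that merely compare against it must ceph_put_snap_context() it on every path; and the comparison itself moves from pointer equality to seq ordering, since distinct context objects can describe the same point in snap history. The corrected caller pattern, in outline:

            struct ceph_snap_context *oldest;

            oldest = get_oldest_context(inode, NULL);  /* takes a reference */
            if (snapc->seq > oldest->seq) {
                    /* page is dirty in a newer context: not writeable yet */
                    ceph_put_snap_context(oldest);
                    goto out;
            }
            ceph_put_snap_context(oldest);  /* drop it on success too */
            /* ... safe to write back under snapc ... */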
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index abb204fea6c7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -2,6 +2,7 @@
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/slab.h>
5 6
6#include "types.h" 7#include "types.h"
7#include "auth_none.h" 8#include "auth_none.h"
@@ -148,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
148 149
149 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
150 if (ret < 0) { 151 if (ret < 0) {
151 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
152 return ret; 154 return ret;
153 } 155 }
154 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -227,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
227 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
228 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
229 } else if (ret) { 231 } else if (ret) {
230 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
231 return ret; 233 return ret;
232 } 234 }
233 return 0; 235 return 0;
@@ -244,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
244 if (!ac->protocol) 246 if (!ac->protocol)
245 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
246 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
247 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
248 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
249 return 0; 251 return 0;
250} 252}
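
After this hunk, ceph_build_auth() keys off the new predicate, so a method can ask to (re)authenticate while still technically authenticated; auth_x uses this to renew ageing tickets (see auth_x.c below), while auth_none simply mirrors its old behaviour. The function as it reads with the change applied (signature per the surrounding code):

    int ceph_build_auth(struct ceph_auth_client *ac,
                        void *msg_buf, size_t msg_len)
    {
            if (!ac->protocol)
                    return ceph_auth_build_hello(ac, msg_buf, msg_len);
            BUG_ON(!ac->ops);
            if (ac->ops->should_authenticate(ac))
                    return ceph_build_auth_request(ac, msg_buf, msg_len);
            return 0;
    }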
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index b4ef6f0a6c85..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -4,6 +4,7 @@
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h>
7 8
8#include "auth_none.h" 9#include "auth_none.h"
9#include "auth.h" 10#include "auth.h"
@@ -30,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
30 return !xi->starting; 31 return !xi->starting;
31} 32}
32 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
33/* 41/*
34 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
35 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -93,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
93} 101}
94 102
95static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
96 .reset = reset, 105 .reset = reset,
97 .destroy = destroy, 106 .destroy = destroy,
98 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
99 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
100 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
101 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
index 56c05533a31c..8164df1a08be 100644
--- a/fs/ceph/auth_none.h
+++ b/fs/ceph/auth_none.h
@@ -1,6 +1,8 @@
1#ifndef _FS_CEPH_AUTH_NONE_H 1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H 2#define _FS_CEPH_AUTH_NONE_H
3 3
4#include <linux/slab.h>
5
4#include "auth.h" 6#include "auth.h"
5 7
6/* 8/*
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index f0318427b6da..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>

 #include "auth_x.h"
 #include "auth_x_protocol.h"
@@ -11,8 +12,6 @@
 #include "auth.h"
 #include "decode.h"

-struct kmem_cache *ceph_x_ticketbuf_cachep;
-
 #define TEMP_TICKET_BUF_LEN	256

 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
@@ -28,6 +27,23 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 	return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }

+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+	       sizeof(u32);
+}
+
 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
 			  void *ibuf, int ilen, void *obuf, size_t olen)
 {
@@ -122,27 +138,26 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	int ret;
 	char *dbuf;
 	char *ticket_buf;
-	u8 struct_v;
+	u8 reply_struct_v;

-	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
 		return -ENOMEM;

 	ret = -ENOMEM;
-	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
-				      GFP_NOFS | GFP_ATOMIC);
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!ticket_buf)
 		goto out_dbuf;

 	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	struct_v = ceph_decode_8(&p);
-	if (struct_v != 1)
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
 		goto bad;
 	num = ceph_decode_32(&p);
 	dout("%d tickets\n", num);
 	while (num--) {
 		int type;
-		u8 struct_v;
+		u8 tkt_struct_v, blob_struct_v;
 		struct ceph_x_ticket_handler *th;
 		void *dp, *dend;
 		int dlen;
@@ -150,14 +165,19 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;

 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);

 		type = ceph_decode_32(&p);
 		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));

-		struct_v = ceph_decode_8(&p);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
 			goto bad;

 		th = get_ticket_handler(ac, type);
@@ -177,21 +197,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dend = dbuf + dlen;
 		dp = dbuf;

-		struct_v = ceph_decode_8(&dp);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
 			goto bad;

 		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
 		if (ret)
 			goto out;

-		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
-		ceph_decode_timespec(&validity, &th->validity);
-		th->expires = get_seconds() + validity.tv_sec;
-		th->renew_after = th->expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", th->expires,
-		     th->renew_after);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);

 		/* ticket blob for service */
 		ceph_decode_8_safe(&p, end, is_enc, bad);
@@ -215,11 +235,22 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		tpend = tp + dlen;
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		struct_v = ceph_decode_8(&tp);
-		th->secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
@@ -228,9 +259,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,

 	ret = 0;
 out:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+	kfree(ticket_buf);
 out_dbuf:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+	kfree(dbuf);
 	return ret;

 bad:
@@ -242,7 +273,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 				  struct ceph_x_ticket_handler *th,
 				  struct ceph_x_authorizer *au)
 {
-	int len;
+	int maxlen;
 	struct ceph_x_authorize_a *msg_a;
 	struct ceph_x_authorize_b msg_b;
 	void *p, *end;
@@ -253,15 +284,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	dout("build_authorizer for %s %p\n",
 	     ceph_entity_type_name(th->service), au);

-	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
-		ticket_blob_len + 16;
-	dout(" need len %d\n", len);
-	if (au->buf && au->buf->alloc_len < len) {
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout(" need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
 		ceph_buffer_put(au->buf);
 		au->buf = NULL;
 	}
 	if (!au->buf) {
-		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
 		if (!au->buf)
 			return -ENOMEM;
 	}
@@ -296,6 +327,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
 	dout(" built authorizer nonce %llx len %d\n", au->nonce,
 	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
 	return 0;

 out_buf:
@@ -581,8 +613,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
 		remove_ticket_handler(ac, th);
 	}

-	kmem_cache_destroy(ceph_x_ticketbuf_cachep);
-
 	kfree(ac->private);
 	ac->private = NULL;
 }
@@ -599,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,


 static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
 	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
 	.build_request = ceph_x_build_request,
 	.handle_reply = ceph_x_handle_reply,
 	.create_authorizer = ceph_x_create_authorizer,
@@ -617,26 +649,20 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	int ret;

 	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
 	xi = kzalloc(sizeof(*xi), GFP_NOFS);
 	if (!xi)
-		return -ENOMEM;
+		goto out;

-	ret = -ENOMEM;
-	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
-				      TEMP_TICKET_BUF_LEN, 8,
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      NULL);
-	if (!ceph_x_ticketbuf_cachep)
-		goto done_nomem;
 	ret = -EINVAL;
 	if (!ac->secret) {
 		pr_err("no secret set (for auth_x protocol)\n");
-		goto done_nomem;
+		goto out_nomem;
 	}

 	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
 	if (ret)
-		goto done_nomem;
+		goto out_nomem;

 	xi->starting = true;
 	xi->ticket_handlers = RB_ROOT;
@@ -646,10 +672,9 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	ac->ops = &ceph_x_ops;
 	return 0;

-done_nomem:
+out_nomem:
 	kfree(xi);
-	if (ceph_x_ticketbuf_cachep)
-		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+out:
 	return ret;
 }

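The ticket-reply rework above is a decode-then-commit pattern: each field is parsed into locals (new_session_key, new_validity, new_ticket_blob, ...) and the live ticket handler is updated only after the whole record decodes cleanly, so a truncated or corrupt reply can no longer leave th half-updated. A generic sketch of the same idiom, with invented names, assuming a decode_record() that fails without side effects:

	/* decode into a scratch copy; commit only on full success */
	int update_record(struct record *r, void **p, void *end)
	{
		struct record tmp;
		int ret = decode_record(&tmp, p, end);	/* invented decoder */

		if (ret)
			return ret;	/* *r untouched on any decode error */
		release_record(r);	/* drop the old record's resources */
		*r = tmp;		/* commit the new contents */
		return 0;
	}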
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index b98086c7aeba..c67535d70aa6 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -1,5 +1,8 @@

 #include "ceph_debug.h"
+
+#include <linux/slab.h>
+
 #include "buffer.h"
 #include "decode.h"

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index db122bb357b8..ae3e3a306445 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
@@ -857,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 }

 /*
+ * Remove a cap. Take steps to deal with a racing iterate_session_caps.
+ *
  * caller should hold i_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
@@ -864,16 +867,12 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;

 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	cap->ci = NULL;
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -884,10 +883,18 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		list_del_init(&cap->session_caps);
 		session->s_nr_caps--;
 		cap->session = NULL;
+		removed = 1;
 	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
 	spin_unlock(&session->s_cap_lock);

-	if (cap->session == NULL)
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
 		ceph_put_cap(cap);

 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
@@ -931,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+	if (!msg)
+		return -ENOMEM;

 	msg->hdr.tid = cpu_to_le64(flush_tid);

@@ -1204,6 +1211,12 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			continue;

+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
 		/* pick mds, take s_mutex */
 		mds = __ceph_get_cap_mds(ci, &mseq);
 		if (session && session->s_mds != mds) {
@@ -1286,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1324,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;

@@ -1407,6 +1421,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
 {
 	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1414,7 +1429,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_cap *cap;
 	int file_wanted, used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
-	int drop_session_lock = session ? 0 : 1;
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
@@ -1639,7 +1653,7 @@ ack:
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);

-	if (session && drop_session_lock)
+	if (session)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
@@ -1651,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1704,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int dirty, i, ret = 1;
+	int i, ret = 1;

 	spin_lock(&inode->i_lock);
-	dirty = __ceph_caps_dirty(ci);
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((ci->i_flushing_caps & (1 << i)) &&
 		    ci->i_cap_flush_tid[i] <= tid) {
@@ -1763,9 +1776,9 @@ out:
 	spin_unlock(&ci->i_unsafe_lock);
 }

-int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ceph_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned flush_tid;
 	int ret;
@@ -1817,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = wait_event_interruptible(ci->i_cap_wq,
 				       caps_are_flushed(inode, flush_tid));
 	} else {
-		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;

 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -1854,8 +1868,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
-			spin_unlock(&inode->i_lock);
 		}
+		spin_unlock(&inode->i_lock);
 	}
 }

@@ -2117,8 +2131,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	}
 	spin_unlock(&inode->i_lock);

-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");

 	if (last && !flushsnaps)
 		ceph_check_caps(ci, 0, NULL);
@@ -2142,7 +2156,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0;
-	int last_snap = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
 	int found = 0;
 	struct ceph_cap_snap *capsnap = NULL;

@@ -2165,19 +2180,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 			if (capsnap->context == snapc) {
 				found = 1;
-				capsnap->dirty_pages -= nr;
-				last_snap = !capsnap->dirty_pages;
 				break;
 			}
 		}
 		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     last_snap ? " (capsnap last)" : "");
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
 	}

 	spin_unlock(&inode->i_lock);
@@ -2185,28 +2213,31 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (last) {
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 		iput(inode);
-	} else if (last_snap) {
+	} else if (complete_capsnap) {
 		ceph_flush_snaps(ci);
 		wake_up(&ci->i_cap_wq);
 	}
+	if (drop_capsnap)
+		iput(inode);
 }

 /*
  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
  * actually be a revocation if it specifies a smaller cap set.)
  *
- * caller holds s_mutex.
+ * caller holds s_mutex and i_lock, we drop both.
+ *
  * return value:
  *  0 - ok
  *  1 - check_caps on auth cap only (writeback)
  *  2 - check_caps (ack revoke)
  */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			     struct ceph_mds_session *session,
 			     struct ceph_cap *cap,
 			     struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-
+	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2216,7 +2247,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
-	int reply = 0;
+	int check_caps = 0;
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
@@ -2329,11 +2360,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
 			writeback = 1; /* will delay ack */
 		else if (dirty & ~newcaps)
-			reply = 1;     /* initiate writeback in check_caps */
+			check_caps = 1; /* initiate writeback in check_caps */
 		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
 			 revoked_rdcache)
-			reply = 2;     /* send revoke ack in check_caps */
+			check_caps = 2; /* send revoke ack in check_caps */
 		cap->issued = newcaps;
+		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
 		dout("caps unchanged: %s -> %s\n",
 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@@ -2346,6 +2378,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		 * pending revocation */
 		wake = 1;
 	}
+	BUG_ON(cap->issued & ~cap->implemented);

 	spin_unlock(&inode->i_lock);
 	if (writeback)
@@ -2359,7 +2392,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
-	return reply;
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
 }

 /*
@@ -2373,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2454,8 +2494,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			break;
 		}
 		WARN_ON(capsnap->dirty_pages || capsnap->writing);
-		dout(" removing cap_snap %p follows %lld\n",
-		     capsnap, follows);
+		dout(" removing %p cap_snap %p follows %lld\n",
+		     inode, capsnap, follows);
 		ceph_put_snap_context(capsnap->context);
 		list_del(&capsnap->ci_item);
 		list_del(&capsnap->flushing_item);
@@ -2548,9 +2588,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_issued = cap->issued;
 		}
 		__ceph_remove_cap(cap);
-	} else {
-		WARN_ON(!cap);
 	}
+	/* else, we already released it */

 	spin_unlock(&inode->i_lock);
 }
@@ -2621,9 +2660,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 cap_id;
 	u64 size, max_size;
 	u64 tid;
-	int check_caps = 0;
 	void *snaptrace;
-	int r;

 	dout("handle_caps from mds%d\n", mds);

@@ -2668,8 +2705,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
 	}

 	/* the rest require a cap */
@@ -2686,16 +2724,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-		else if (r == 2)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-		break;
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;

 	case CEPH_CAP_OP_FLUSH_ACK:
 		handle_cap_flush_ack(inode, tid, h, session, cap);
@@ -2713,9 +2743,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,

 done:
 	mutex_unlock(&session->s_mutex);
-
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+done_unlocked:
 	if (inode)
 		iput(inode);
 	return;
@@ -2838,11 +2866,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
 	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+	int used = 0;

 	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+
+	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused caps */
+	drop &= ~used;
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
 		if (force ||
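ceph_encode_inode_release() now masks the caps it is asked to drop against the caps currently in use, so an open file cannot have a cap it is actively relying on released out from under it. The masking is plain bit arithmetic; a worked example with illustrative values (not taken from the patch):

	/* illustrative only: which caps actually get dropped */
	int drop = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD;  /* requested */
	int used = CEPH_CAP_FILE_RD;                        /* in active use */

	drop &= ~used;	/* only CEPH_CAP_FILE_CACHE remains to be dropped */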
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0

 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
  * client-facing protocol.
  */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
@@ -53,8 +53,18 @@
 /*
  * feature bits
  */
-#define CEPH_FEATURE_SUPPORTED 0
-#define CEPH_FEATURE_REQUIRED 0
+#define CEPH_FEATURE_UID 1
+#define CEPH_FEATURE_NOSRCADDR 2
+#define CEPH_FEATURE_FLOCK 4
+
+#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
+#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR


 /*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE		0x1
 #define CEPH_AUTH_CEPHX		0x2

+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+

 /*********************************************
  * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313

+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
 /* osd */
 #define CEPH_MSG_OSD_MAP                41
 #define CEPH_MSG_OSD_OP                 42
 #define CEPH_MSG_OSD_OPREPLY            43

+/* pool operations */
+enum {
+	POOL_OP_CREATE			= 0x01,
+	POOL_OP_DELETE			= 0x02,
+	POOL_OP_AUID_CHANGE		= 0x03,
+	POOL_OP_CREATE_SNAP		= 0x11,
+	POOL_OP_DELETE_SNAP		= 0x12,
+	POOL_OP_CREATE_UNMANAGED_SNAP	= 0x21,
+	POOL_OP_DELETE_UNMANAGED_SNAP	= 0x22,
+};
+
 struct ceph_mon_request_header {
 	__le64 have_version;
 	__le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
 	struct ceph_statfs st;
 } __attribute__ ((packed));

+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
@@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
  * - they also define the lock ordering by the MDS
  * - a few of these are internal to the mds
  */
-#define CEPH_LOCK_DN          1
-#define CEPH_LOCK_ISNAP       2
-#define CEPH_LOCK_IVERSION    4     /* mds internal */
-#define CEPH_LOCK_IFILE       8     /* mds internal */
-#define CEPH_LOCK_IAUTH       32
-#define CEPH_LOCK_ILINK       64
-#define CEPH_LOCK_IDFT        128   /* dir frag tree */
-#define CEPH_LOCK_INEST       256   /* mds internal */
-#define CEPH_LOCK_IXATTR      512
-#define CEPH_LOCK_INO         2048  /* immutable inode bits; not a lock */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_ISNAP       16
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */

 /* client_session ops */
 enum {
@@ -308,6 +362,7 @@ union ceph_mds_request_args {
 	struct {
 		__le32 frag;                 /* which dir fragment */
 		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
 	} __attribute__ ((packed)) readdir;
 	struct {
 		__le32 mode;
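The per-role feature macros above expand to bare OR-lists with no surrounding parentheses, so a call site combining them with other operators should parenthesize the macro itself. A hedged sketch of a handshake check (the peer_features variable and the error path are invented for the example):

	/* sketch: confirm the peer advertises every required feature bit */
	u64 required = CEPH_FEATURE_REQUIRED_CLIENT;

	if ((peer_features & required) != required)
		return -EPROTO;	/* peer lacks a required feature */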
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
 	case CEPH_ENTITY_TYPE_OSD: return "osd";
 	case CEPH_ENTITY_TYPE_MON: return "mon";
 	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
 	case CEPH_ENTITY_TYPE_AUTH: return "auth";
 	default: return "unknown";
 	}
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
 	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
 	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";

 	case CEPH_OSD_OP_PULL: return "pull";
 	case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
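ceph_pool_op_name() follows the same pattern as the other string tables in this file and is handy in debug output; a hypothetical use:

	dout("poolop reply op %d (%s)\n", op, ceph_pool_op_name(op));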
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 291ac288e791..f704b3b62424 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -3,6 +3,7 @@

 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <crypto/hash.h>

 #include "crypto.h"
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index e159f1415110..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h>
4#include <linux/module.h> 5#include <linux/module.h>
5#include <linux/ctype.h> 6#include <linux/ctype.h>
6#include <linux/debugfs.h> 7#include <linux/debugfs.h>
@@ -112,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
112static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
113{ 114{
114 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
116 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp; 118 struct rb_node *rp;
118 119
@@ -125,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
125 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
127 128
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
130 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
131 } 137 }
132 138
133 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5107384ee029..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -3,6 +3,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sched.h>

 #include "super.h"
@@ -50,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
 		return -ENOMEM;          /* oh well */

 	spin_lock(&dentry->d_lock);
-	if (dentry->d_fsdata) /* lost a race */
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
+	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
 	dentry->d_fsdata = di;
@@ -124,7 +128,8 @@ more:
 	dentry = list_entry(p, struct dentry, d_u.d_child);
 	di = ceph_dentry(dentry);
 	while (1) {
-		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
 		     parent->d_subdirs.prev, parent->d_subdirs.next);
 		if (p == &parent->d_subdirs) {
 			fi->at_end = 1;
@@ -170,11 +175,11 @@ more:
 	spin_lock(&inode->i_lock);
 	spin_lock(&dcache_lock);

+	last = dentry;
+
 	if (err < 0)
 		goto out_unlock;

-	last = dentry;
-
 	p = p->prev;
 	filp->f_pos++;

@@ -228,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = client->mount_args->max_readdir;
+	const int max_bytes = client->mount_args->max_readdir_bytes;

 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -288,8 +294,10 @@ more:
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

 		/* discard old result, if any */
-		if (fi->last_readdir)
+		if (fi->last_readdir) {
 			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}

 		/* requery frag tree, as the frag topology may have changed */
 		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
@@ -309,7 +317,8 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-		req->r_num_caps = max_entries;
+		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
+		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
 			ceph_mdsc_put_request(req);
@@ -332,7 +341,7 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 0;
+			fi->next_offset = 2;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
@@ -475,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;

 	/* .snap dir? */
@@ -486,6 +495,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
 		d_add(dentry, inode);
 		err = 0;
 	}
@@ -564,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	    !is_root_ceph_dentry(dir, dentry) &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-		di->offset = ci->i_max_offset++;
 		spin_unlock(&dir->i_lock);
 		dout(" dir %p complete, -ENOENT\n", dir);
 		d_add(dentry, NULL);
@@ -578,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
 	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
 	if (IS_ERR(req))
-		return ERR_PTR(PTR_ERR(req));
+		return ERR_CAST(req);
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	/* we only need inode linkage */
@@ -876,12 +885,30 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * do_request, above).  If there is no trace, we need
 	 * to do it here.
 	 */
+
+		/* d_move screws up d_subdirs order */
+		ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+
 		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		ceph_invalidate_dentry_lease(new_dentry);
 	}
 	ceph_mdsc_put_request(req);
 	return err;
 }

+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	spin_unlock(&dentry->d_lock);
+}

 /*
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
@@ -959,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;

-	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+	     ceph_dentry(dentry)->offset);

 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1037,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;

-	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;

 	if (!cf->dir_info) {
@@ -1079,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
  * an fsync() on a dir will wait for any uncommitted directory
  * operations to commit.
  */
-static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
-			  int datasync)
+static int ceph_dir_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct list_head *head = &ci->i_unsafe_dirops;
 	struct ceph_mds_request *req;
@@ -1139,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1152,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;

-	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
+	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1170,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
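Two of the readdir changes above are related: fi->next_offset restarts at 2 (not 0) when a fragment ends because directory offsets 0 and 1 are taken by the locally synthesized "." and ".." entries, and the new max_bytes field bounds reply size alongside max_entries. The offset convention, sketched as illustrative comments (not code from the patch):

	/* ceph directory stream offsets:
	 *   f_pos 0  -> "."  (synthesized by the client)
	 *   f_pos 1  -> ".." (synthesized by the client)
	 *   f_pos 2+ -> entries from MDS readdir replies,
	 *               so a fresh fragment resumes at offset 2
	 */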
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fc68e39cbad6..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h>
4#include <asm/unaligned.h> 5#include <asm/unaligned.h>
5 6
6#include "super.h" 7#include "super.h"
@@ -92,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
92 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
93 94
94 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
95 if (!dentry) { 96 if (IS_ERR(dentry)) {
96 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
97 fh->ino, inode); 98 fh->ino, inode);
98 iput(inode); 99 iput(inode);
99 return ERR_PTR(-ENOMEM); 100 return dentry;
100 } 101 }
101 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
102 103
@@ -114,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
114static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
116{ 117{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
118 struct inode *inode; 119 struct inode *inode;
119 struct dentry *dentry; 120 struct dentry *dentry;
120 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -132,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS); 134 USE_ANY_MDS);
134 if (IS_ERR(req)) 135 if (IS_ERR(req))
135 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
136 137
137 req->r_ino1 = vino; 138 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
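Note on the hunk above: ERR_CAST() replaces the ERR_PTR(PTR_ERR(...)) round-trip. Both forward the same encoded errno; the cast simply re-types the error pointer without decoding and re-encoding it. A minimal sketch of the two forms, reusing the names from this hunk:

    /* before: decode the errno, then re-encode it */
    if (IS_ERR(req))
            return ERR_PTR(PTR_ERR(req));

    /* after: same errno, re-typed for this function's return type */
    if (IS_ERR(req))
            return ERR_CAST(req);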
@@ -148,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
148 } 149 }
149 150
150 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
151 if (!dentry) { 152 if (IS_ERR(dentry)) {
152 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
153 cfh->ino, inode); 154 cfh->ino, inode);
154 iput(inode); 155 iput(inode);
155 return ERR_PTR(-ENOMEM); 156 return dentry;
156 } 157 }
157 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
158 if (err < 0) { 159 if (err < 0) {
@@ -201,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
201 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
202 203
203 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
204 if (!dentry) { 205 if (IS_ERR(dentry)) {
205 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
206 cfh->ino, inode); 207 cfh->ino, inode);
207 iput(inode); 208 iput(inode);
208 return ERR_PTR(-ENOMEM); 209 return dentry;
209 } 210 }
210 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
211 if (err < 0) { 212 if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5d2af8464f6a..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/sched.h> 3#include <linux/sched.h>
4#include <linux/slab.h>
4#include <linux/file.h> 5#include <linux/file.h>
5#include <linux/namei.h> 6#include <linux/namei.h>
6#include <linux/writeback.h> 7#include <linux/writeback.h>
@@ -229,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
229 /* do the open */ 230 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
231 if (IS_ERR(req)) 232 if (IS_ERR(req))
232 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
233 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
234 req->r_num_caps = 2; 235 req->r_num_caps = 2;
235 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -316,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
316/* 317/*
 317 * allocate a vector of new pages 318 * allocate a vector of new pages
318 */ 319 */
319static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
320{ 321{
321 struct page **pages; 322 struct page **pages;
322 int i; 323 int i;
323 324
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
325 if (!pages) 326 if (!pages)
326 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
329 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
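Note on the hunk above: the allocator is renamed ceph_alloc_page_vector(), made non-static, and given a caller-supplied gfp_t so other callers can pick their own allocation context; __page_cache_alloc() also honours page-cache NUMA policy where a bare alloc_page() does not. The patched function, consolidated here for readability:

    #include <linux/err.h>
    #include <linux/pagemap.h>
    #include <linux/slab.h>

    struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
    {
            struct page **pages;
            int i;

            pages = kmalloc(sizeof(*pages) * num_pages, flags);
            if (!pages)
                    return ERR_PTR(-ENOMEM);
            for (i = 0; i < num_pages; i++) {
                    pages[i] = __page_cache_alloc(flags);
                    if (pages[i] == NULL) {
                            /* free the pages allocated so far, then the vector */
                            ceph_release_page_vector(pages, i);
                            return ERR_PTR(-ENOMEM);
                    }
            }
            return pages;
    }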
@@ -539,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
539 * in sequence. 540 * in sequence.
540 */ 541 */
541 } else { 542 } else {
542 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
543 } 544 }
544 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
545 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -648,8 +649,8 @@ more:
648 do_sync, 649 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2); 651 &mtime, false, 2);
651 if (IS_ERR(req)) 652 if (!req)
652 return PTR_ERR(req); 653 return -ENOMEM;
653 654
654 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
655 656
@@ -664,9 +665,10 @@ more:
664 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
665 * may block. 666 * may block.
666 */ 667 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len); 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1));
668 } else { 670 } else {
669 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
670 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
672 goto out; 674 goto out;
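Note on the truncate hunk above: truncate_inode_pages_range() takes an inclusive byte range whose end is expected to fall on the last byte of a page, so the old exclusive pos+len could leave the final partial page in the cache. ORing with PAGE_CACHE_SIZE-1 rounds the end up, assuming the page size is a power of two. A tiny sketch of the arithmetic with 4 KiB pages:

    #define PAGE_CACHE_SIZE 4096UL  /* sketch value; power of two assumed */

    static unsigned long inclusive_page_end(unsigned long pos, unsigned long len)
    {
            /* last byte of the page containing offset pos + len */
            return (pos + len) | (PAGE_CACHE_SIZE - 1);
    }

    /* inclusive_page_end(0, 5000) == 8191, i.e. the final byte of page 1 */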
@@ -807,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
808 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
809 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
810 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
811 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
812 int got = 0; 814 int got = 0;
813 int ret, err; 815 int ret, err;
@@ -842,8 +844,7 @@ retry_snap:
842 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
843 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
844 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
845 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
846 pos, pos + ret - 1, 1);
847 if (err < 0) 848 if (err < 0)
848 ret = err; 849 ret = err;
849 } 850 }
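Note on the fsync hunk above: the call is updated to a vfs_fsync_range() that no longer takes a dentry argument, presumably tracking a VFS API change of the period; start/end are an inclusive byte range and the final 1 requests datasync behaviour. Sketch of the updated call:

    err = vfs_fsync_range(file, pos, pos + ret - 1, 1 /* datasync */);
    if (err < 0)
            ret = err;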
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7abe1aed819b..226f5a50d362 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
378 378
379 ceph_queue_caps_release(inode); 379 ceph_queue_caps_release(inode);
380 380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
381 kfree(ci->i_symlink); 397 kfree(ci->i_symlink);
382 while ((n = rb_first(&ci->i_fragtree)) != NULL) { 398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
383 frag = rb_entry(n, struct ceph_inode_frag, node); 399 frag = rb_entry(n, struct ceph_inode_frag, node);
@@ -603,11 +619,12 @@ static int fill_inode(struct inode *inode,
603 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
604 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
605 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
606 } 623 }
607 624
608 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
609 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
610 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
611 628
612 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
613 case S_IFIFO: 630 case S_IFIFO:
@@ -658,14 +675,15 @@ static int fill_inode(struct inode *inode,
658 /* set dir completion flag? */ 675 /* set dir completion flag? */
659 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
660 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
661 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
662 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
663 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
664 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
665 } 683 }
666 684
667 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
668 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
669 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
670 break; 688 break;
671 default: 689 default:
@@ -717,6 +735,10 @@ no_change:
717 __ceph_get_fmode(ci, cap_fmode); 735 __ceph_get_fmode(ci, cap_fmode);
718 spin_unlock(&inode->i_lock); 736 spin_unlock(&inode->i_lock);
719 } 737 }
738 } else if (cap_fmode >= 0) {
739 pr_warning("mds issued no caps on %llx.%llx\n",
740 ceph_vinop(inode));
741 __ceph_get_fmode(ci, cap_fmode);
720 } 742 }
721 743
722 /* update delegation info? */ 744 /* update delegation info? */
@@ -782,6 +804,37 @@ out_unlock:
782} 804}
783 805
784/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
785 * splice a dentry to an inode. 838 * splice a dentry to an inode.
786 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
787 * 840 *
@@ -794,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
794{ 847{
795 struct dentry *realdn; 848 struct dentry *realdn;
796 849
850 BUG_ON(dn->d_inode);
851
797 /* dn must be unhashed */ 852 /* dn must be unhashed */
798 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
799 d_drop(dn); 854 d_drop(dn);
@@ -815,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
815 dn = realdn; 870 dn = realdn;
816 } else { 871 } else {
817 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
818
819 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
820 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
821 } 875 }
822 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
823 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
824out: 879out:
825 return dn; 880 return dn;
826} 881}
827 882
828/* 883/*
829 * Set dentry's directory position based on the current dir's max, and
830 * order it in d_subdirs, so that dcache_readdir behaves.
831 */
832static void ceph_set_dentry_offset(struct dentry *dn)
833{
834 struct dentry *dir = dn->d_parent;
835 struct inode *inode = dn->d_parent->d_inode;
836 struct ceph_dentry_info *di;
837
838 BUG_ON(!inode);
839
840 di = ceph_dentry(dn);
841
842 spin_lock(&inode->i_lock);
843 di->offset = ceph_inode(inode)->i_max_offset++;
844 spin_unlock(&inode->i_lock);
845
846 spin_lock(&dcache_lock);
847 spin_lock(&dn->d_lock);
848 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
849 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
850 dn->d_u.d_child.prev, dn->d_u.d_child.next);
851 spin_unlock(&dn->d_lock);
852 spin_unlock(&dcache_lock);
853}
854
855/*
856 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
857 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
858 * after a lookup). 886 * after a lookup).
@@ -870,6 +898,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
870 struct inode *in = NULL; 898 struct inode *in = NULL;
871 struct ceph_mds_reply_inode *ininfo; 899 struct ceph_mds_reply_inode *ininfo;
872 struct ceph_vino vino; 900 struct ceph_vino vino;
901 struct ceph_client *client = ceph_sb_to_client(sb);
873 int i = 0; 902 int i = 0;
874 int err = 0; 903 int err = 0;
875 904
@@ -912,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
912 941
913 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
914 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
915 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
916 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
917 ceph_inode(req->r_locked_dir);
918 dout(" clearing %p complete (empty trace)\n",
919 req->r_locked_dir);
920 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
921 ci->i_release_count++;
922 }
923 return 0; 946 return 0;
924 } 947 }
925 948
@@ -933,7 +956,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 return err; 956 return err;
934 } 957 }
935 958
936 if (rinfo->head->is_dentry && !req->r_aborted) { 959 /*
960 * ignore null lease/binding on snapdir ENOENT, or else we
961 * will have trouble splicing in the virtual snapdir later
962 */
963 if (rinfo->head->is_dentry && !req->r_aborted &&
964 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
965 client->mount_args->snapdir_name,
966 req->r_dentry->d_name.len))) {
937 /* 967 /*
938 * lookup link rename : null -> possibly existing inode 968 * lookup link rename : null -> possibly existing inode
939 * mknod symlink mkdir : null -> new inode 969 * mknod symlink mkdir : null -> new inode
@@ -973,19 +1003,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
973 dn, dn->d_name.len, dn->d_name.name); 1003 dn, dn->d_name.len, dn->d_name.name);
974 dout("fill_trace doing d_move %p -> %p\n", 1004 dout("fill_trace doing d_move %p -> %p\n",
975 req->r_old_dentry, dn); 1005 req->r_old_dentry, dn);
1006
1007 /* d_move screws up d_subdirs order */
1008 ceph_i_clear(dir, CEPH_I_COMPLETE);
1009
976 d_move(req->r_old_dentry, dn); 1010 d_move(req->r_old_dentry, dn);
977 dout(" src %p '%.*s' dst %p '%.*s'\n", 1011 dout(" src %p '%.*s' dst %p '%.*s'\n",
978 req->r_old_dentry, 1012 req->r_old_dentry,
979 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
980 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
981 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
982 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
983 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
984 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
985 ceph_dentry(dn)->lease_shared_gen = 0; 1020
986 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
987 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
988 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
989 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
990 in = dn->d_inode; 1029 in = dn->d_inode;
991 } 1030 }
@@ -1027,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1027 goto done; 1066 goto done;
1028 } 1067 }
1029 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1030 ceph_set_dentry_offset(dn);
1031 igrab(in); 1069 igrab(in);
1032 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1033 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1070,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1070 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1071 goto done; 1109 goto done;
1072 } 1110 }
1073 ceph_set_dentry_offset(dn);
1074 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1075 igrab(in); 1112 igrab(in);
1076 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1397,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1397{ 1434{
1398 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1399 1436
1400 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1401 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1402 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1403 igrab(inode); 1440 igrab(inode);
@@ -1486,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1486 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1487 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1488 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1489 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1490 int issued; 1527 int issued;
1491 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1492 int mask = 0; 1529 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2600101ec22..b49f12822cbc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h>
4#include <linux/sched.h> 5#include <linux/sched.h>
5 6
6#include "mds_client.h" 7#include "mds_client.h"
@@ -39,7 +40,7 @@
39static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head); 41 struct list_head *head);
41 42
42const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
43 44
44 45
45/* 46/*
@@ -328,6 +329,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
328 struct ceph_mds_session *s; 329 struct ceph_mds_session *s;
329 330
330 s = kzalloc(sizeof(*s), GFP_NOFS); 331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
331 s->s_mdsc = mdsc; 334 s->s_mdsc = mdsc;
332 s->s_mds = mds; 335 s->s_mds = mds;
333 s->s_state = CEPH_MDS_SESSION_NEW; 336 s->s_state = CEPH_MDS_SESSION_NEW;
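Note on the hunk above: kzalloc() returns NULL on failure and the old code dereferenced s unconditionally. Returning ERR_PTR(-ENOMEM) lets the __do_request() hunk further down propagate the failure with PTR_ERR(). The producer/consumer pair, sketched:

    s = kzalloc(sizeof(*s), GFP_NOFS);
    if (!s)
            return ERR_PTR(-ENOMEM);        /* callee side */

    session = register_session(mdsc, mds);
    if (IS_ERR(session)) {                  /* caller side */
            err = PTR_ERR(session);
            goto finish;
    }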
@@ -529,7 +532,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
529{ 532{
530 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
531 rb_erase(&req->r_node, &mdsc->request_tree); 534 rb_erase(&req->r_node, &mdsc->request_tree);
532 ceph_mdsc_put_request(req); 535 RB_CLEAR_NODE(&req->r_node);
533 536
534 if (req->r_unsafe_dir) { 537 if (req->r_unsafe_dir) {
535 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -538,6 +541,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
538 list_del_init(&req->r_unsafe_dir_item); 541 list_del_init(&req->r_unsafe_dir_item);
539 spin_unlock(&ci->i_unsafe_lock); 542 spin_unlock(&ci->i_unsafe_lock);
540 } 543 }
544
545 ceph_mdsc_put_request(req);
541} 546}
542 547
543/* 548/*
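Note on the __unregister_request hunks above: two things change. RB_CLEAR_NODE() marks the erased node so that RB_EMPTY_NODE() can later detect a request that already left the tree (the reworked wait_unsafe_requests() near the end of this patch depends on this), and the final ceph_mdsc_put_request() moves after the unsafe-dir cleanup so the request cannot be freed while its fields are still in use. The marker idiom, sketched generically:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    static void unhook(struct rb_root *tree, struct rb_node *node)
    {
            rb_erase(node, tree);
            RB_CLEAR_NODE(node);            /* node now reads as "in no tree" */
    }

    /* later, under the same lock that guards the tree: */
    static bool still_linked(struct rb_node *node)
    {
            return !RB_EMPTY_NODE(node);    /* false once unhook() has run */
    }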
@@ -660,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
660 struct ceph_msg *msg; 665 struct ceph_msg *msg;
661 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
662 667
663 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
664 if (IS_ERR(msg)) { 669 if (!msg) {
665 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
666 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
667 } 672 }
668 h = msg->front.iov_base; 673 h = msg->front.iov_base;
669 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -682,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
682 struct ceph_msg *msg; 687 struct ceph_msg *msg;
683 int mstate; 688 int mstate;
684 int mds = session->s_mds; 689 int mds = session->s_mds;
685 int err = 0;
686 690
687 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
688 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -693,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
693 697
694 /* send connect message */ 698 /* send connect message */
695 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
696 if (IS_ERR(msg)) { 700 if (!msg)
697 err = PTR_ERR(msg); 701 return -ENOMEM;
698 goto out;
699 }
700 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
701
702out:
703 return 0; 703 return 0;
704} 704}
705 705
@@ -731,9 +731,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
731} 731}
732 732
733/* 733/*
734 * Helper to safely iterate over all caps associated with a session. 734 * Helper to safely iterate over all caps associated with a session, with
735 * special care taken to handle a racing __ceph_remove_cap().
735 * 736 *
736 * caller must hold session s_mutex 737 * Caller must hold session s_mutex.
737 */ 738 */
738static int iterate_session_caps(struct ceph_mds_session *session, 739static int iterate_session_caps(struct ceph_mds_session *session,
739 int (*cb)(struct inode *, struct ceph_cap *, 740 int (*cb)(struct inode *, struct ceph_cap *,
@@ -798,12 +799,49 @@ out:
798} 799}
799 800
800static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
801 void *arg) 802 void *arg)
802{ 803{
803 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
804 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
805 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
806 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
807 return 0; 845 return 0;
808} 846}
809 847
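Note on the remove_session_caps_cb hunk above: the callback now removes the cap itself under i_lock and, when the last real cap goes away, strips any dirty/flushing state that would otherwise pin the inode after its session is gone. Each stripped item held an inode reference, so the drops are counted while the spinlocks are held and the matching iput() calls, which can sleep, run only after both locks are released. The shape of that pattern, sketched with a hypothetical predicate standing in for the dirty/flushing checks:

    static void strip_inode_state(struct inode *inode)
    {
            int drop = 0;

            spin_lock(&inode->i_lock);
            if (state_to_strip(inode))      /* hypothetical predicate */
                    drop++;                 /* one per reference being stripped */
            spin_unlock(&inode->i_lock);

            while (drop--)
                    iput(inode);            /* may sleep: must not hold i_lock */
    }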
@@ -815,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
815 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
816 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
817 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
818 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
819} 858}
820 859
@@ -862,6 +901,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
862 if (time_after_eq(jiffies, session->s_cap_ttl) && 901 if (time_after_eq(jiffies, session->s_cap_ttl) &&
863 time_after_eq(session->s_cap_ttl, session->s_renew_requested)) 902 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
864 pr_info("mds%d caps stale\n", session->s_mds); 903 pr_info("mds%d caps stale\n", session->s_mds);
904 session->s_renew_requested = jiffies;
865 905
866 /* do not try to renew caps until a recovering mds has reconnected 906 /* do not try to renew caps until a recovering mds has reconnected
867 * with its clients. */ 907 * with its clients. */
@@ -874,11 +914,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
874 914
875 dout("send_renew_caps to mds%d (%s)\n", session->s_mds, 915 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
876 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
877 session->s_renew_requested = jiffies;
878 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
879 ++session->s_renew_seq); 918 ++session->s_renew_seq);
880 if (IS_ERR(msg)) 919 if (!msg)
881 return PTR_ERR(msg); 920 return -ENOMEM;
882 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
883 return 0; 922 return 0;
884} 923}
@@ -925,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
925 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
926{ 965{
927 struct ceph_msg *msg; 966 struct ceph_msg *msg;
928 int err = 0;
929 967
930 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
931 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
932 session->s_seq); 970 session->s_seq);
933 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
934 if (IS_ERR(msg)) 972 if (!msg)
935 err = PTR_ERR(msg); 973 return -ENOMEM;
936 else 974 ceph_con_send(&session->s_con, msg);
937 ceph_con_send(&session->s_con, msg); 975 return 0;
938 return err;
939} 976}
940 977
941/* 978/*
@@ -1053,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1053 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1054 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1055 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1056 0, 0, NULL); 1093 GFP_NOFS);
1057 if (!msg) 1094 if (!msg)
1058 goto out_unlocked; 1095 goto out_unlocked;
1059 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1145,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1145 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1146 1183
1147 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1148 while (1) { 1185 spin_lock(&session->s_cap_lock);
1149 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1150 if (list_empty(&session->s_cap_releases_done))
1151 break;
1152 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1153 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1154 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1156,7 +1191,46 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1156 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1157 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1158 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1195 }
1196 spin_unlock(&session->s_cap_lock);
1197}
1198
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1159 } 1232 }
1233
1160 spin_unlock(&session->s_cap_lock); 1234 spin_unlock(&session->s_cap_lock);
1161} 1235}
1162 1236
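Note on the hunk above: send_cap_releases() becomes the standard list-drain idiom, taking s_cap_lock once, popping the head entry, dropping the lock around the potentially blocking ceph_con_send(), then retaking it until the list is empty; the new discard_cap_releases() walks the same lists but zeroes the messages and requeues them instead of sending. The drain idiom in isolation:

    spin_lock(&session->s_cap_lock);
    while (!list_empty(&session->s_cap_releases_done)) {
            msg = list_first_entry(&session->s_cap_releases_done,
                                   struct ceph_msg, list_head);
            list_del_init(&msg->list_head);
            spin_unlock(&session->s_cap_lock);      /* sending may block */
            ceph_con_send(&session->s_con, msg);
            spin_lock(&session->s_cap_lock);
    }
    spin_unlock(&session->s_cap_lock);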
@@ -1175,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1175 if (!req) 1249 if (!req)
1176 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1177 1251
1252 mutex_init(&req->r_fill_mutex);
1178 req->r_started = jiffies; 1253 req->r_started = jiffies;
1179 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1180 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1245,7 +1320,7 @@ retry:
1245 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1246 temp = temp->d_parent; 1321 temp = temp->d_parent;
1247 if (temp == NULL) { 1322 if (temp == NULL) {
1248 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1249 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1250 } 1325 }
1251 } 1326 }
@@ -1261,7 +1336,7 @@ retry:
1261 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1262 1337
1263 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1264 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1265 pos, temp); 1340 pos, temp);
1266 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1267 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1272,20 +1347,18 @@ retry:
1272 break; 1347 break;
1273 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1274 temp->d_name.len); 1349 temp->d_name.len);
1275 dout("build_path_dentry path+%d: %p '%.*s'\n",
1276 pos, temp, temp->d_name.len, path + pos);
1277 } 1350 }
1278 if (pos) 1351 if (pos)
1279 path[--pos] = '/'; 1352 path[--pos] = '/';
1280 temp = temp->d_parent; 1353 temp = temp->d_parent;
1281 if (temp == NULL) { 1354 if (temp == NULL) {
1282 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1283 kfree(path); 1356 kfree(path);
1284 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1285 } 1358 }
1286 } 1359 }
1287 if (pos != 0) { 1360 if (pos != 0) {
1288 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1289 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1290 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1291 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1297,7 +1370,7 @@ retry:
1297 1370
1298 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1299 *plen = len; 1372 *plen = len;
1300 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1301 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1302 return path; 1375 return path;
1303} 1376}
@@ -1420,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1420 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1421 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1422 1495
1423 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1424 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1425 goto out_free2; 1499 goto out_free2;
1500 }
1426 1501
1427 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1428 1503
@@ -1511,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1511 } 1586 }
1512 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1513 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1514 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1515 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1516 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1517 } 1592 }
1518 req->r_request = msg; 1593 req->r_request = msg;
1519 1594
@@ -1546,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1546 int mds = -1; 1621 int mds = -1;
1547 int err = -EAGAIN; 1622 int err = -EAGAIN;
1548 1623
1549 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1550 goto out; 1625 goto out;
1551 1626
1552 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1566,8 +1641,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
1566 1641
1567 /* get, open session */ 1642 /* get, open session */
1568 session = __ceph_lookup_mds_session(mdsc, mds); 1643 session = __ceph_lookup_mds_session(mdsc, mds);
1569 if (!session) 1644 if (!session) {
1570 session = register_session(mdsc, mds); 1645 session = register_session(mdsc, mds);
1646 if (IS_ERR(session)) {
1647 err = PTR_ERR(session);
1648 goto finish;
1649 }
1650 }
1571 dout("do_request mds%d session %p state %s\n", mds, session, 1651 dout("do_request mds%d session %p state %s\n", mds, session,
1572 session_state_name(session->s_state)); 1652 session_state_name(session->s_state));
1573 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1653 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1598,7 +1678,7 @@ out:
1598 return err; 1678 return err;
1599 1679
1600finish: 1680finish:
1601 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1602 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1603 goto out; 1683 goto out;
1604} 1684}
@@ -1619,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1619 1699
1620/* 1700/*
1621 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1622 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1623 * wake up if their requests has been forwarded to @mds, too.
1624 */ 1703 */
1625static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1626{ 1705{
1627 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1628 struct rb_node *p; 1707 struct rb_node *p;
@@ -1678,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1678 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1679 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1680 1759
1681 /* wait */ 1760 if (req->r_err) {
1682 if (!req->r_reply) { 1761 err = req->r_err;
1683 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1684 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1685 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1686 &req->r_completion, req->r_timeout);
1687 if (err == 0)
1688 req->r_reply = ERR_PTR(-EIO);
1689 else if (err < 0)
1690 req->r_reply = ERR_PTR(err);
1691 } else {
1692 err = wait_for_completion_interruptible(
1693 &req->r_completion);
1694 if (err)
1695 req->r_reply = ERR_PTR(err);
1696 }
1697 mutex_lock(&mdsc->mutex);
1698 } 1765 }
1699 1766
1700 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1701 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1702 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_killable(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1703 1780
1704 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1705 /* aborted */ 1782 if (req->r_got_result) {
1706 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1707 1786
1708 if (req->r_locked_dir && 1787 /*
1709 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1710 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1711 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1712 1796
1713 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1714 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1715 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1716 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1717 ci->i_release_count++;
1718 spin_unlock(&req->r_locked_dir->i_lock);
1719 }
1720 } else {
1721 /* clean up this request */
1722 __unregister_request(mdsc, req);
1723 if (!list_empty(&req->r_unsafe_item))
1724 list_del_init(&req->r_unsafe_item);
1725 complete(&req->r_safe_completion);
1726 }
1727 } else if (req->r_err) {
1728 err = req->r_err;
1729 } else { 1800 } else {
1730 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1731 } 1802 }
1732 mutex_unlock(&mdsc->mutex);
1733 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1734 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1735 return err; 1807 return err;
1736} 1808}
1737 1809
1738/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
1831/*
1739 * Handle mds reply. 1832 * Handle mds reply.
1740 * 1833 *
1741 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
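Note on the ceph_mdsc_do_request rework above: it replaces the r_reply-as-status convention with explicit r_err/r_got_result fields, switches to killable (rather than interruptible) waits so ordinary signals no longer abort metadata operations, and introduces r_fill_mutex: an aborting waiter and a late handle_reply() both take it, so ceph_fill_trace()/ceph_readdir_prepopulate() can never interleave with a request being marked aborted. The core of the race handling, condensed into one hypothetical helper:

    static int wait_and_settle(struct ceph_mds_client *mdsc,
                               struct ceph_mds_request *req)
    {
            int err = wait_for_completion_killable(&req->r_completion);

            mutex_lock(&mdsc->mutex);
            if (req->r_got_result) {
                    /* a real reply won the race: use its result */
                    err = le32_to_cpu(req->r_reply_info.head->result);
            } else if (err < 0) {
                    /* abort under r_fill_mutex so fill_trace can't interleave */
                    mutex_lock(&req->r_fill_mutex);
                    req->r_err = err;
                    req->r_aborted = true;
                    mutex_unlock(&req->r_fill_mutex);
            } else {
                    err = req->r_err;
            }
            mutex_unlock(&mdsc->mutex);
            return err;
    }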
@@ -1770,7 +1863,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1770 dout("handle_reply %p\n", req); 1863 dout("handle_reply %p\n", req);
1771 1864
1772 /* correct session? */ 1865 /* correct session? */
1773 if (!req->r_session && req->r_session != session) { 1866 if (req->r_session != session) {
1774 pr_err("mdsc_handle_reply got %llu on session mds%d" 1867 pr_err("mdsc_handle_reply got %llu on session mds%d"
1775 " not mds%d\n", tid, session->s_mds, 1868 " not mds%d\n", tid, session->s_mds,
1776 req->r_session ? req->r_session->s_mds : -1); 1869 req->r_session ? req->r_session->s_mds : -1);
@@ -1786,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1786 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1787 goto out; 1880 goto out;
1788 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1789 1888
1790 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1791 1890
@@ -1827,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1827 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1828 goto out; 1927 goto out;
1829 } 1928 }
1830 } 1929 } else {
1831
1832 BUG_ON(req->r_reply);
1833
1834 if (!head->safe) {
1835 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1836 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1837 } 1932 }
@@ -1860,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1860 } 1955 }
1861 1956
1862 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1863 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1864 if (err == 0) { 1960 if (err == 0) {
1865 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1866 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1867 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1868 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1869 1966
1870 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1871out_err: 1968out_err:
1872 if (err) { 1969 mutex_lock(&mdsc->mutex);
1873 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1874 } else { 1978 } else {
1875 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1876 ceph_msg_get(msg);
1877 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1878 1982
1879 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1880 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1910,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1910 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
1911 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
1912 if (!req) { 2016 if (!req) {
1913 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1914 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
1915 } 2019 }
1916 2020
1917 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
1918 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1919 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
1920 } else { 2027 } else {
1921 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
1922 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
1923 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
1924 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
1925 put_request_session(req); 2034 put_request_session(req);
@@ -1973,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session,
1973 2082
1974 switch (op) { 2083 switch (op) {
1975 case CEPH_SESSION_OPEN: 2084 case CEPH_SESSION_OPEN:
2085 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2086 pr_info("mds%d reconnect success\n", session->s_mds);
1976 session->s_state = CEPH_MDS_SESSION_OPEN; 2087 session->s_state = CEPH_MDS_SESSION_OPEN;
1977 renewed_caps(mdsc, session, 0); 2088 renewed_caps(mdsc, session, 0);
1978 wake = 1; 2089 wake = 1;
@@ -1986,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session,
1986 break; 2097 break;
1987 2098
1988 case CEPH_SESSION_CLOSE: 2099 case CEPH_SESSION_CLOSE:
2100 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2101 pr_info("mds%d reconnect denied\n", session->s_mds);
1989 remove_session_caps(session); 2102 remove_session_caps(session);
1990 wake = 1; /* for good measure */ 2103 wake = 1; /* for good measure */
1991 complete(&mdsc->session_close_waiters); 2104 complete(&mdsc->session_close_waiters);
1992 kick_requests(mdsc, mds, 0); /* cur only */ 2105 kick_requests(mdsc, mds);
1993 break; 2106 break;
1994 2107
1995 case CEPH_SESSION_STALE: 2108 case CEPH_SESSION_STALE:
@@ -2121,61 +2234,51 @@ out:
2121 * 2234 *
2122 * called with mdsc->mutex held. 2235 * called with mdsc->mutex held.
2123 */ 2236 */
2124static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2237static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2238 struct ceph_mds_session *session)
2125{ 2239{
2126 struct ceph_mds_session *session = NULL;
2127 struct ceph_msg *reply; 2240 struct ceph_msg *reply;
2128 struct rb_node *p; 2241 struct rb_node *p;
2129 int err; 2242 int mds = session->s_mds;
2243 int err = -ENOMEM;
2130 struct ceph_pagelist *pagelist; 2244 struct ceph_pagelist *pagelist;
2131 2245
2132 pr_info("reconnect to recovering mds%d\n", mds); 2246 pr_info("mds%d reconnect start\n", mds);
2133 2247
2134 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2248 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2135 if (!pagelist) 2249 if (!pagelist)
2136 goto fail_nopagelist; 2250 goto fail_nopagelist;
2137 ceph_pagelist_init(pagelist); 2251 ceph_pagelist_init(pagelist);
2138 2252
2139 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2253 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2140 if (IS_ERR(reply)) { 2254 if (!reply)
2141 err = PTR_ERR(reply);
2142 goto fail_nomsg; 2255 goto fail_nomsg;
2143 }
2144
2145 /* find session */
2146 session = __ceph_lookup_mds_session(mdsc, mds);
2147 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2148
2149 if (session) {
2150 mutex_lock(&session->s_mutex);
2151 2256
2152 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2257 mutex_lock(&session->s_mutex);
2153 session->s_seq = 0; 2258 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2259 session->s_seq = 0;
2154 2260
2155 ceph_con_open(&session->s_con, 2261 ceph_con_open(&session->s_con,
2156 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2262 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2157 2263
2158 /* replay unsafe requests */ 2264 /* replay unsafe requests */
2159 replay_unsafe_requests(mdsc, session); 2265 replay_unsafe_requests(mdsc, session);
2160 } else {
2161 dout("no session for mds%d, will send short reconnect\n",
2162 mds);
2163 }
2164 2266
2165 down_read(&mdsc->snap_rwsem); 2267 down_read(&mdsc->snap_rwsem);
2166 2268
2167 if (!session)
2168 goto send;
2169 dout("session %p state %s\n", session, 2269 dout("session %p state %s\n", session,
2170 session_state_name(session->s_state)); 2270 session_state_name(session->s_state));
2171 2271
2272 /* drop old cap expires; we're about to reestablish that state */
2273 discard_cap_releases(mdsc, session);
2274
2172 /* traverse this session's caps */ 2275 /* traverse this session's caps */
2173 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2276 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2174 if (err) 2277 if (err)
2175 goto fail; 2278 goto fail;
2176 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2279 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2177 if (err < 0) 2280 if (err < 0)
2178 goto out; 2281 goto fail;
2179 2282
2180 /* 2283 /*
2181 * snaprealms. we provide mds with the ino, seq (version), and 2284 * snaprealms. we provide mds with the ino, seq (version), and
@@ -2197,34 +2300,30 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2197 goto fail; 2300 goto fail;
2198 } 2301 }
2199 2302
2200send:
2201 reply->pagelist = pagelist; 2303 reply->pagelist = pagelist;
2202 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2304 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2203 reply->nr_pages = calc_pages_for(0, pagelist->length); 2305 reply->nr_pages = calc_pages_for(0, pagelist->length);
2204 ceph_con_send(&session->s_con, reply); 2306 ceph_con_send(&session->s_con, reply);
2205 2307
2206 if (session) { 2308 mutex_unlock(&session->s_mutex);
2207 session->s_state = CEPH_MDS_SESSION_OPEN;
2208 __wake_requests(mdsc, &session->s_waiting);
2209 }
2210 2309
2211out:
2212 up_read(&mdsc->snap_rwsem);
2213 if (session) {
2214 mutex_unlock(&session->s_mutex);
2215 ceph_put_mds_session(session);
2216 }
2217 mutex_lock(&mdsc->mutex); 2310 mutex_lock(&mdsc->mutex);
2311 __wake_requests(mdsc, &session->s_waiting);
2312 mutex_unlock(&mdsc->mutex);
2313
2314 up_read(&mdsc->snap_rwsem);
2218 return; 2315 return;
2219 2316
2220fail: 2317fail:
2221 ceph_msg_put(reply); 2318 ceph_msg_put(reply);
2319 up_read(&mdsc->snap_rwsem);
2320 mutex_unlock(&session->s_mutex);
2222fail_nomsg: 2321fail_nomsg:
2223 ceph_pagelist_release(pagelist); 2322 ceph_pagelist_release(pagelist);
2224 kfree(pagelist); 2323 kfree(pagelist);
2225fail_nopagelist: 2324fail_nopagelist:
2226 pr_err("ENOMEM preparing reconnect for mds%d\n", mds); 2325 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2227 goto out; 2326 return;
2228} 2327}
2229 2328
2230 2329
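Note on the send_mds_reconnect rewrite above: with the session now passed in (and guaranteed non-NULL), the failure paths collapse into ordered labels that unwind in reverse order of acquisition: fail drops the message, snap_rwsem and the session mutex; fail_nomsg releases the pagelist; fail_nopagelist only logs. A generic, self-contained sketch of that staged-unwind idiom (names and sizes are illustrative only):

    static int setup_two(void **a, void **b)
    {
            int err = -ENOMEM;

            *a = kmalloc(64, GFP_NOFS);
            if (!*a)
                    goto fail_no_a;
            *b = kmalloc(64, GFP_NOFS);
            if (!*b)
                    goto fail_no_b;
            return 0;

    fail_no_b:
            kfree(*a);              /* undo in reverse acquisition order */
    fail_no_a:
            pr_err("setup failed: %d\n", err);
            return err;
    }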
@@ -2276,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2276 } 2375 }
2277 2376
2278 /* kick any requests waiting on the recovering mds */ 2377 /* kick any requests waiting on the recovering mds */
2279 kick_requests(mdsc, i, 1); 2378 kick_requests(mdsc, i);
2280 } else if (oldstate == newstate) { 2379 } else if (oldstate == newstate) {
2281 continue; /* nothing new with this mds */ 2380 continue; /* nothing new with this mds */
2282 } 2381 }
@@ -2285,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2285 * send reconnect? 2384 * send reconnect?
2286 */ 2385 */
2287 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2386 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2288 newstate >= CEPH_MDS_STATE_RECONNECT) 2387 newstate >= CEPH_MDS_STATE_RECONNECT) {
2289 send_mds_reconnect(mdsc, i); 2388 mutex_unlock(&mdsc->mutex);
2389 send_mds_reconnect(mdsc, s);
2390 mutex_lock(&mdsc->mutex);
2391 }
2290 2392
2291 /* 2393 /*
 2292 * kick requests on any mds that has gone active. 2394 * kick requests on any mds that has gone active.
2293 *
2294 * kick requests on cur or forwarder: we may have sent
2295 * the request to mds1, mds1 told us it forwarded it
2296 * to mds2, but then we learn mds1 failed and can't be
2297 * sure it successfully forwarded our request before
2298 * it died.
2299 */ 2395 */
2300 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2396 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2301 newstate >= CEPH_MDS_STATE_ACTIVE) { 2397 newstate >= CEPH_MDS_STATE_ACTIVE) {
2302 pr_info("mds%d reconnect completed\n", s->s_mds); 2398 if (oldstate != CEPH_MDS_STATE_CREATING &&
2303 kick_requests(mdsc, i, 1); 2399 oldstate != CEPH_MDS_STATE_STARTING)
2400 pr_info("mds%d recovery completed\n", s->s_mds);
2401 kick_requests(mdsc, i);
2304 ceph_kick_flushing_caps(mdsc, s); 2402 ceph_kick_flushing_caps(mdsc, s);
2305 wake_up_session_caps(s, 1); 2403 wake_up_session_caps(s, 1);
2306 } 2404 }
@@ -2443,12 +2541,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2443 dnamelen = dentry->d_name.len; 2541 dnamelen = dentry->d_name.len;
2444 len += dnamelen; 2542 len += dnamelen;
2445 2543
2446 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2544 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2447 if (IS_ERR(msg)) 2545 if (!msg)
2448 return; 2546 return;
2449 lease = msg->front.iov_base; 2547 lease = msg->front.iov_base;
2450 lease->action = action; 2548 lease->action = action;
2451 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2549 lease->mask = cpu_to_le16(1);
2452 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2550 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2453 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2551 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2454 lease->seq = cpu_to_le32(seq); 2552 lease->seq = cpu_to_le32(seq);
@@ -2478,7 +2576,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2478 2576
2479 BUG_ON(inode == NULL); 2577 BUG_ON(inode == NULL);
2480 BUG_ON(dentry == NULL); 2578 BUG_ON(dentry == NULL);
2481 BUG_ON(mask != CEPH_LOCK_DN); 2579 BUG_ON(mask == 0);
2482 2580
2483 /* is dentry lease valid? */ 2581 /* is dentry lease valid? */
2484 spin_lock(&dentry->d_lock); 2582 spin_lock(&dentry->d_lock);
@@ -2589,7 +2687,9 @@ static void delayed_work(struct work_struct *work)
2589 else 2687 else
2590 ceph_con_keepalive(&s->s_con); 2688 ceph_con_keepalive(&s->s_con);
2591 add_cap_releases(mdsc, s, -1); 2689 add_cap_releases(mdsc, s, -1);
2592 send_cap_releases(mdsc, s); 2690 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2691 s->s_state == CEPH_MDS_SESSION_HUNG)
2692 send_cap_releases(mdsc, s);
2593 mutex_unlock(&s->s_mutex); 2693 mutex_unlock(&s->s_mutex);
2594 ceph_put_mds_session(s); 2694 ceph_put_mds_session(s);
2595 2695
@@ -2606,6 +2706,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2606 mdsc->client = client; 2706 mdsc->client = client;
2607 mutex_init(&mdsc->mutex); 2707 mutex_init(&mdsc->mutex);
2608 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2708 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2709 if (mdsc->mdsmap == NULL)
2710 return -ENOMEM;
2711
2609 init_completion(&mdsc->safe_umount_waiters); 2712 init_completion(&mdsc->safe_umount_waiters);
2610 init_completion(&mdsc->session_close_waiters); 2713 init_completion(&mdsc->session_close_waiters);
2611 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2714 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2631,6 +2734,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2631 init_waitqueue_head(&mdsc->cap_flushing_wq); 2734 init_waitqueue_head(&mdsc->cap_flushing_wq);
2632 spin_lock_init(&mdsc->dentry_lru_lock); 2735 spin_lock_init(&mdsc->dentry_lru_lock);
2633 INIT_LIST_HEAD(&mdsc->dentry_lru); 2736 INIT_LIST_HEAD(&mdsc->dentry_lru);
2737
2634 return 0; 2738 return 0;
2635} 2739}
2636 2740
@@ -2682,29 +2786,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2682 */ 2786 */
2683static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) 2787static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2684{ 2788{
2685 struct ceph_mds_request *req = NULL; 2789 struct ceph_mds_request *req = NULL, *nextreq;
2686 struct rb_node *n; 2790 struct rb_node *n;
2687 2791
2688 mutex_lock(&mdsc->mutex); 2792 mutex_lock(&mdsc->mutex);
2689 dout("wait_unsafe_requests want %lld\n", want_tid); 2793 dout("wait_unsafe_requests want %lld\n", want_tid);
2794restart:
2690 req = __get_oldest_req(mdsc); 2795 req = __get_oldest_req(mdsc);
2691 while (req && req->r_tid <= want_tid) { 2796 while (req && req->r_tid <= want_tid) {
2797 /* find next request */
2798 n = rb_next(&req->r_node);
2799 if (n)
2800 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2801 else
2802 nextreq = NULL;
2692 if ((req->r_op & CEPH_MDS_OP_WRITE)) { 2803 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2693 /* write op */ 2804 /* write op */
2694 ceph_mdsc_get_request(req); 2805 ceph_mdsc_get_request(req);
2806 if (nextreq)
2807 ceph_mdsc_get_request(nextreq);
2695 mutex_unlock(&mdsc->mutex); 2808 mutex_unlock(&mdsc->mutex);
2696 dout("wait_unsafe_requests wait on %llu (want %llu)\n", 2809 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2697 req->r_tid, want_tid); 2810 req->r_tid, want_tid);
2698 wait_for_completion(&req->r_safe_completion); 2811 wait_for_completion(&req->r_safe_completion);
2699 mutex_lock(&mdsc->mutex); 2812 mutex_lock(&mdsc->mutex);
2700 n = rb_next(&req->r_node);
2701 ceph_mdsc_put_request(req); 2813 ceph_mdsc_put_request(req);
2702 } else { 2814 if (!nextreq)
2703 n = rb_next(&req->r_node); 2815 break; /* next dne before, so we're done! */
2816 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2817 /* next request was removed from tree */
2818 ceph_mdsc_put_request(nextreq);
2819 goto restart;
2820 }
2821 ceph_mdsc_put_request(nextreq); /* won't go away */
2704 } 2822 }
2705 if (!n) 2823 req = nextreq;
2706 break;
2707 req = rb_entry(n, struct ceph_mds_request, r_node);
2708 } 2824 }
2709 mutex_unlock(&mdsc->mutex); 2825 mutex_unlock(&mdsc->mutex);
2710 dout("wait_unsafe_requests done\n"); 2826 dout("wait_unsafe_requests done\n");
@@ -2714,6 +2830,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2714{ 2830{
2715 u64 want_tid, want_flush; 2831 u64 want_tid, want_flush;
2716 2832
2833 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2834 return;
2835
2717 dout("sync\n"); 2836 dout("sync\n");
2718 mutex_lock(&mdsc->mutex); 2837 mutex_lock(&mdsc->mutex);
2719 want_tid = mdsc->last_tid; 2838 want_tid = mdsc->last_tid;
@@ -2896,9 +3015,10 @@ static void con_put(struct ceph_connection *con)
2896static void peer_reset(struct ceph_connection *con) 3015static void peer_reset(struct ceph_connection *con)
2897{ 3016{
2898 struct ceph_mds_session *s = con->private; 3017 struct ceph_mds_session *s = con->private;
3018 struct ceph_mds_client *mdsc = s->s_mdsc;
2899 3019
2900 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3020 pr_warning("mds%d closed our session\n", s->s_mds);
2901 s->s_mds); 3021 send_mds_reconnect(mdsc, s);
2902} 3022}
2903 3023
2904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3024static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3005,7 +3125,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3005 return ceph_monc_validate_auth(&mdsc->client->monc); 3125 return ceph_monc_validate_auth(&mdsc->client->monc);
3006} 3126}
3007 3127
3008const static struct ceph_connection_operations mds_con_ops = { 3128static const struct ceph_connection_operations mds_con_ops = {
3009 .get = con_get, 3129 .get = con_get,
3010 .put = con_put, 3130 .put = con_put,
3011 .dispatch = dispatch, 3131 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 781656a49bf8..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -6,6 +6,7 @@
6#include <linux/inet.h> 6#include <linux/inet.h>
7#include <linux/kthread.h> 7#include <linux/kthread.h>
8#include <linux/net.h> 8#include <linux/net.h>
9#include <linux/slab.h>
9#include <linux/socket.h> 10#include <linux/socket.h>
10#include <linux/string.h> 11#include <linux/string.h>
11#include <net/tcp.h> 12#include <net/tcp.h>
@@ -29,23 +30,15 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
29static char tag_ack = CEPH_MSGR_TAG_ACK; 30static char tag_ack = CEPH_MSGR_TAG_ACK;
30static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
31 32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
32 37
33static void queue_con(struct ceph_connection *con); 38static void queue_con(struct ceph_connection *con);
34static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
35static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
36 41
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/* 42/*
50 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
51 */ 44 */
@@ -127,6 +120,12 @@ void ceph_msgr_exit(void)
127 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
128} 121}
129 122
	 	 123	void ceph_msgr_flush(void)
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
130/* 129/*
131 * socket callback functions 130 * socket callback functions
132 */ 131 */
@@ -227,6 +226,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
227 con->sock = sock; 226 con->sock = sock;
228 sock->sk->sk_allocation = GFP_NOFS; 227 sock->sk->sk_allocation = GFP_NOFS;
229 228
229#ifdef CONFIG_LOCKDEP
230 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
231#endif
232
230 set_sock_callbacks(sock, con); 233 set_sock_callbacks(sock, con);
231 234
232 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 235 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
@@ -331,7 +334,9 @@ static void reset_connection(struct ceph_connection *con)
331 ceph_msg_put(con->out_msg); 334 ceph_msg_put(con->out_msg);
332 con->out_msg = NULL; 335 con->out_msg = NULL;
333 } 336 }
337 con->out_keepalive_pending = false;
334 con->in_seq = 0; 338 con->in_seq = 0;
339 con->in_seq_acked = 0;
335} 340}
336 341
337/* 342/*
@@ -347,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con)
347 clear_bit(WRITE_PENDING, &con->state); 352 clear_bit(WRITE_PENDING, &con->state);
348 mutex_lock(&con->mutex); 353 mutex_lock(&con->mutex);
349 reset_connection(con); 354 reset_connection(con);
355 con->peer_global_seq = 0;
350 cancel_delayed_work(&con->work); 356 cancel_delayed_work(&con->work);
351 mutex_unlock(&con->mutex); 357 mutex_unlock(&con->mutex);
352 queue_con(con); 358 queue_con(con);
@@ -366,6 +372,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
366} 372}
367 373
368/* 374/*
375 * return true if this connection ever successfully opened
376 */
377bool ceph_con_opened(struct ceph_connection *con)
378{
379 return con->connect_seq > 0;
380}
381
382/*
369 * generic get/put 383 * generic get/put
370 */ 384 */
371struct ceph_connection *ceph_con_get(struct ceph_connection *con) 385struct ceph_connection *ceph_con_get(struct ceph_connection *con)
@@ -474,7 +488,14 @@ static void prepare_write_message(struct ceph_connection *con)
474 list_move_tail(&m->list_head, &con->out_sent); 488 list_move_tail(&m->list_head, &con->out_sent);
475 } 489 }
476 490
477 m->hdr.seq = cpu_to_le64(++con->out_seq); 491 /*
492 * only assign outgoing seq # if we haven't sent this message
	 493	 * yet. if it is requeued, resend with its original seq.
494 */
495 if (m->needs_out_seq) {
496 m->hdr.seq = cpu_to_le64(++con->out_seq);
497 m->needs_out_seq = false;
498 }
478 499
479 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 500 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
480 m, con->out_seq, le16_to_cpu(m->hdr.type), 501 m, con->out_seq, le16_to_cpu(m->hdr.type),
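The needs_out_seq flag introduced here pairs with the change to ceph_con_send() later in this diff: the flag is set when the message is queued and cleared the first time the writer assigns a wire sequence number, so a message requeued after a connection fault is resent with its original seq and the peer can recognize it as a duplicate. Both halves side by side, as a sketch rather than the full functions:

	/* ceph_con_send(): queue time */
	msg->needs_out_seq = true;

	/* prepare_write_message(): only the first transmission assigns a seq */
	if (msg->needs_out_seq) {
		msg->hdr.seq = cpu_to_le64(++con->out_seq);
		msg->needs_out_seq = false;
	}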
@@ -636,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
636 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 657 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
637 con->connect_seq, global_seq, proto); 658 con->connect_seq, global_seq, proto);
638 659
639 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 660 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
640 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 661 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
641 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 662 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
642 con->out_connect.global_seq = cpu_to_le32(global_seq); 663 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -830,13 +851,6 @@ static void prepare_read_connect(struct ceph_connection *con)
830 con->in_base_pos = 0; 851 con->in_base_pos = 0;
831} 852}
832 853
833static void prepare_read_connect_retry(struct ceph_connection *con)
834{
835 dout("prepare_read_connect_retry %p\n", con);
836 con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
837 + sizeof(con->peer_addr_for_me);
838}
839
840static void prepare_read_ack(struct ceph_connection *con) 854static void prepare_read_ack(struct ceph_connection *con)
841{ 855{
842 dout("prepare_read_ack %p\n", con); 856 dout("prepare_read_ack %p\n", con);
@@ -1106,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con)
1106 1120
1107static int process_connect(struct ceph_connection *con) 1121static int process_connect(struct ceph_connection *con)
1108{ 1122{
1109 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1123 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1110 u64 req_feat = CEPH_FEATURE_REQUIRED; 1124 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1111 u64 server_feat = le64_to_cpu(con->in_reply.features); 1125 u64 server_feat = le64_to_cpu(con->in_reply.features);
1112 1126
1113 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1127 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1146,7 +1160,7 @@ static int process_connect(struct ceph_connection *con)
1146 } 1160 }
1147 con->auth_retry = 1; 1161 con->auth_retry = 1;
1148 prepare_write_connect(con->msgr, con, 0); 1162 prepare_write_connect(con->msgr, con, 0);
1149 prepare_read_connect_retry(con); 1163 prepare_read_connect(con);
1150 break; 1164 break;
1151 1165
1152 case CEPH_MSGR_TAG_RESETSESSION: 1166 case CEPH_MSGR_TAG_RESETSESSION:
@@ -1215,6 +1229,7 @@ static int process_connect(struct ceph_connection *con)
1215 clear_bit(CONNECTING, &con->state); 1229 clear_bit(CONNECTING, &con->state);
1216 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1230 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1217 con->connect_seq++; 1231 con->connect_seq++;
1232 con->peer_features = server_feat;
1218 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1233 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1219 con->peer_global_seq, 1234 con->peer_global_seq,
1220 le32_to_cpu(con->in_reply.connect_seq), 1235 le32_to_cpu(con->in_reply.connect_seq),
@@ -1323,6 +1338,7 @@ static int read_partial_message(struct ceph_connection *con)
1323 unsigned front_len, middle_len, data_len, data_off; 1338 unsigned front_len, middle_len, data_len, data_off;
1324 int datacrc = con->msgr->nocrc; 1339 int datacrc = con->msgr->nocrc;
1325 int skip; 1340 int skip;
1341 u64 seq;
1326 1342
1327 dout("read_partial_message con %p msg %p\n", con, m); 1343 dout("read_partial_message con %p msg %p\n", con, m);
1328 1344
@@ -1357,6 +1373,25 @@ static int read_partial_message(struct ceph_connection *con)
1357 return -EIO; 1373 return -EIO;
1358 data_off = le16_to_cpu(con->in_hdr.data_off); 1374 data_off = le16_to_cpu(con->in_hdr.data_off);
1359 1375
1376 /* verify seq# */
1377 seq = le64_to_cpu(con->in_hdr.seq);
1378 if ((s64)seq - (s64)con->in_seq < 1) {
1379 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1380 ENTITY_NAME(con->peer_name),
1381 pr_addr(&con->peer_addr.in_addr),
1382 seq, con->in_seq + 1);
1383 con->in_base_pos = -front_len - middle_len - data_len -
1384 sizeof(m->footer);
1385 con->in_tag = CEPH_MSGR_TAG_READY;
1386 con->in_seq++;
1387 return 0;
1388 } else if ((s64)seq - (s64)con->in_seq > 1) {
1389 pr_err("read_partial_message bad seq %lld expected %lld\n",
1390 seq, con->in_seq + 1);
1391 con->error_msg = "bad message sequence # for incoming message";
1392 return -EBADMSG;
1393 }
1394
1360 /* allocate message? */ 1395 /* allocate message? */
1361 if (!con->in_msg) { 1396 if (!con->in_msg) {
1362 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1397 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
@@ -1364,18 +1399,17 @@ static int read_partial_message(struct ceph_connection *con)
1364 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1399 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1365 if (skip) { 1400 if (skip) {
1366 /* skip this message */ 1401 /* skip this message */
1367 dout("alloc_msg returned NULL, skipping message\n"); 1402 dout("alloc_msg said skip message\n");
1368 con->in_base_pos = -front_len - middle_len - data_len - 1403 con->in_base_pos = -front_len - middle_len - data_len -
1369 sizeof(m->footer); 1404 sizeof(m->footer);
1370 con->in_tag = CEPH_MSGR_TAG_READY; 1405 con->in_tag = CEPH_MSGR_TAG_READY;
1406 con->in_seq++;
1371 return 0; 1407 return 0;
1372 } 1408 }
1373 if (IS_ERR(con->in_msg)) { 1409 if (!con->in_msg) {
1374 ret = PTR_ERR(con->in_msg);
1375 con->in_msg = NULL;
1376 con->error_msg = 1410 con->error_msg =
1377 "error allocating memory for incoming message"; 1411 "error allocating memory for incoming message";
1378 return ret; 1412 return -ENOMEM;
1379 } 1413 }
1380 m = con->in_msg; 1414 m = con->in_msg;
1381 m->front.iov_len = 0; /* haven't read it yet */ 1415 m->front.iov_len = 0; /* haven't read it yet */
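The sequence check added to read_partial_message() above distinguishes two failure modes: a seq at or below the last one received is a retransmit of something already consumed, so the payload is swallowed (the negative in_base_pos tells the read loop how many bytes to discard before the next tag), while a seq more than one ahead means messages were lost, which is fatal for the connection. Just that branch logic, condensed:

	s64 delta = (s64)seq - (s64)con->in_seq;

	if (delta < 1) {
		/* duplicate: discard front+middle+data+footer, keep reading */
		con->in_base_pos = -front_len - middle_len - data_len -
			sizeof(m->footer);
		con->in_tag = CEPH_MSGR_TAG_READY;
		con->in_seq++;
		return 0;
	}
	if (delta > 1) {
		con->error_msg = "bad message sequence # for incoming message";
		return -EBADMSG;                /* tears the connection down */
	}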
@@ -1475,14 +1509,14 @@ static void process_message(struct ceph_connection *con)
1475 1509
1476 /* if first message, set peer_name */ 1510 /* if first message, set peer_name */
1477 if (con->peer_name.type == 0) 1511 if (con->peer_name.type == 0)
1478 con->peer_name = msg->hdr.src.name; 1512 con->peer_name = msg->hdr.src;
1479 1513
1480 con->in_seq++; 1514 con->in_seq++;
1481 mutex_unlock(&con->mutex); 1515 mutex_unlock(&con->mutex);
1482 1516
1483 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1517 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1484 msg, le64_to_cpu(msg->hdr.seq), 1518 msg, le64_to_cpu(msg->hdr.seq),
1485 ENTITY_NAME(msg->hdr.src.name), 1519 ENTITY_NAME(msg->hdr.src),
1486 le16_to_cpu(msg->hdr.type), 1520 le16_to_cpu(msg->hdr.type),
1487 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1521 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1488 le32_to_cpu(msg->hdr.front_len), 1522 le32_to_cpu(msg->hdr.front_len),
@@ -1507,7 +1541,6 @@ static int try_write(struct ceph_connection *con)
1507 dout("try_write start %p state %lu nref %d\n", con, con->state, 1541 dout("try_write start %p state %lu nref %d\n", con, con->state,
1508 atomic_read(&con->nref)); 1542 atomic_read(&con->nref));
1509 1543
1510 mutex_lock(&con->mutex);
1511more: 1544more:
1512 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1545 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1513 1546
@@ -1600,7 +1633,6 @@ do_next:
1600done: 1633done:
1601 ret = 0; 1634 ret = 0;
1602out: 1635out:
1603 mutex_unlock(&con->mutex);
1604 dout("try_write done on %p\n", con); 1636 dout("try_write done on %p\n", con);
1605 return ret; 1637 return ret;
1606} 1638}
@@ -1612,7 +1644,6 @@ out:
1612 */ 1644 */
1613static int try_read(struct ceph_connection *con) 1645static int try_read(struct ceph_connection *con)
1614{ 1646{
1615 struct ceph_messenger *msgr;
1616 int ret = -1; 1647 int ret = -1;
1617 1648
1618 if (!con->sock) 1649 if (!con->sock)
@@ -1622,9 +1653,6 @@ static int try_read(struct ceph_connection *con)
1622 return 0; 1653 return 0;
1623 1654
1624 dout("try_read start on %p\n", con); 1655 dout("try_read start on %p\n", con);
1625 msgr = con->msgr;
1626
1627 mutex_lock(&con->mutex);
1628 1656
1629more: 1657more:
1630 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1658 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1719,7 +1747,6 @@ more:
1719done: 1747done:
1720 ret = 0; 1748 ret = 0;
1721out: 1749out:
1722 mutex_unlock(&con->mutex);
1723 dout("try_read done on %p\n", con); 1750 dout("try_read done on %p\n", con);
1724 return ret; 1751 return ret;
1725 1752
@@ -1791,6 +1818,8 @@ more:
1791 dout("con_work %p start, clearing QUEUED\n", con); 1818 dout("con_work %p start, clearing QUEUED\n", con);
1792 clear_bit(QUEUED, &con->state); 1819 clear_bit(QUEUED, &con->state);
1793 1820
1821 mutex_lock(&con->mutex);
1822
1794 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1823 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1795 dout("con_work CLOSED\n"); 1824 dout("con_work CLOSED\n");
1796 con_close_socket(con); 1825 con_close_socket(con);
@@ -1805,11 +1834,16 @@ more:
1805 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1834 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1806 try_read(con) < 0 || 1835 try_read(con) < 0 ||
1807 try_write(con) < 0) { 1836 try_write(con) < 0) {
1837 mutex_unlock(&con->mutex);
1808 backoff = 1; 1838 backoff = 1;
1809 ceph_fault(con); /* error/fault path */ 1839 ceph_fault(con); /* error/fault path */
1840 goto done_unlocked;
1810 } 1841 }
1811 1842
1812done: 1843done:
1844 mutex_unlock(&con->mutex);
1845
1846done_unlocked:
1813 clear_bit(BUSY, &con->state); 1847 clear_bit(BUSY, &con->state);
1814 dout("con->state=%lu\n", con->state); 1848 dout("con->state=%lu\n", con->state);
1815 if (test_bit(QUEUED, &con->state)) { 1849 if (test_bit(QUEUED, &con->state)) {
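The try_read()/try_write() hunks and this one are a single locking change: con->mutex used to be taken and released inside each helper, and is now held by con_work() across the whole cycle, with a separate done_unlocked exit for the fault path because ceph_fault() acquires the mutex itself. Roughly:

	mutex_lock(&con->mutex);
	/* CLOSED/OPENING checks now run under the lock */
	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
	    try_read(con) < 0 ||                /* no longer locks internally */
	    try_write(con) < 0) {
		mutex_unlock(&con->mutex);
		ceph_fault(con);                /* takes con->mutex on its own */
		goto done_unlocked;
	}
done:                                           /* other exits jump here */
	mutex_unlock(&con->mutex);
done_unlocked:
	/* BUSY/QUEUED handling needs no mutex */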
@@ -1843,8 +1877,6 @@ static void ceph_fault(struct ceph_connection *con)
1843 goto out; 1877 goto out;
1844 } 1878 }
1845 1879
1846 clear_bit(BUSY, &con->state); /* to avoid an improbable race */
1847
1848 mutex_lock(&con->mutex); 1880 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state)) 1881 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock; 1882 goto out_unlock;
@@ -1910,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1910 1942
1911 /* the zero page is needed if a request is "canceled" while the message 1943 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */ 1944 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1945 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) { 1946 if (!msgr->zero_page) {
1915 kfree(msgr); 1947 kfree(msgr);
1916 return ERR_PTR(-ENOMEM); 1948 return ERR_PTR(-ENOMEM);
@@ -1950,12 +1982,12 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1950 } 1982 }
1951 1983
1952 /* set src+dst */ 1984 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name; 1985 msg->hdr.src = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956 1986
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1987 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958 1988
1989 msg->needs_out_seq = true;
1990
1959 /* queue */ 1991 /* queue */
1960 mutex_lock(&con->mutex); 1992 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head)); 1993 BUG_ON(!list_empty(&msg->list_head));
@@ -2021,6 +2053,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2021 ceph_msg_put(con->in_msg); 2053 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL; 2054 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY; 2055 con->in_tag = CEPH_MSGR_TAG_READY;
2056 con->in_seq++;
2024 } else { 2057 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n", 2058 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg); 2059 con, con->in_msg, msg);
@@ -2043,26 +2076,29 @@ void ceph_con_keepalive(struct ceph_connection *con)
2043 * construct a new message with given type, size 2076 * construct a new message with given type, size
2044 * the new msg has a ref count of 1. 2077 * the new msg has a ref count of 1.
2045 */ 2078 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len, 2079struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2047 int page_len, int page_off, struct page **pages)
2048{ 2080{
2049 struct ceph_msg *m; 2081 struct ceph_msg *m;
2050 2082
2051 m = kmalloc(sizeof(*m), GFP_NOFS); 2083 m = kmalloc(sizeof(*m), flags);
2052 if (m == NULL) 2084 if (m == NULL)
2053 goto out; 2085 goto out;
2054 kref_init(&m->kref); 2086 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head); 2087 INIT_LIST_HEAD(&m->list_head);
2056 2088
2089 m->hdr.tid = 0;
2057 m->hdr.type = cpu_to_le16(type); 2090 m->hdr.type = cpu_to_le16(type);
2091 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2092 m->hdr.version = 0;
2058 m->hdr.front_len = cpu_to_le32(front_len); 2093 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0; 2094 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len); 2095 m->hdr.data_len = 0;
2061 m->hdr.data_off = cpu_to_le16(page_off); 2096 m->hdr.data_off = 0;
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 2097 m->hdr.reserved = 0;
2063 m->footer.front_crc = 0; 2098 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0; 2099 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0; 2100 m->footer.data_crc = 0;
2101 m->footer.flags = 0;
2066 m->front_max = front_len; 2102 m->front_max = front_len;
2067 m->front_is_vmalloc = false; 2103 m->front_is_vmalloc = false;
2068 m->more_to_follow = false; 2104 m->more_to_follow = false;
@@ -2071,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2071 /* front */ 2107 /* front */
2072 if (front_len) { 2108 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) { 2109 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2110 m->front.iov_base = __vmalloc(front_len, flags,
2075 PAGE_KERNEL); 2111 PAGE_KERNEL);
2076 m->front_is_vmalloc = true; 2112 m->front_is_vmalloc = true;
2077 } else { 2113 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2114 m->front.iov_base = kmalloc(front_len, flags);
2079 } 2115 }
2080 if (m->front.iov_base == NULL) { 2116 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n", 2117 pr_err("msg_new can't allocate %d bytes\n",
@@ -2091,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2091 m->middle = NULL; 2127 m->middle = NULL;
2092 2128
2093 /* data */ 2129 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len); 2130 m->nr_pages = 0;
2095 m->pages = pages; 2131 m->pages = NULL;
2096 m->pagelist = NULL; 2132 m->pagelist = NULL;
2097 2133
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2134 dout("ceph_msg_new %p front %d\n", m, front_len);
2099 m->nr_pages);
2100 return m; 2135 return m;
2101 2136
2102out2: 2137out2:
2103 ceph_msg_put(m); 2138 ceph_msg_put(m);
2104out: 2139out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2140 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM); 2141 return NULL;
2107} 2142}
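This is the API change that most of the earlier hunks (lease, auth, subscribe, statfs) adapt to: ceph_msg_new() loses the page-vector parameters, takes explicit gfp flags, and reports failure as NULL instead of an ERR_PTR. A typical caller now looks like this (a sketch; the message type and size are just examples drawn from this diff):

	struct ceph_msg *msg;

	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(struct ceph_mon_statfs),
			   GFP_NOFS);
	if (!msg)                 /* was: if (IS_ERR(msg)) ... PTR_ERR(msg) */
		return -ENOMEM;
	/* data pages, if any, are now attached to msg->pages separately */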
2108 2143
2109/* 2144/*
@@ -2146,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2146 mutex_unlock(&con->mutex); 2181 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip); 2182 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex); 2183 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg)) 2184 if (!msg || *skip)
2150 return msg;
2151
2152 if (*skip)
2153 return NULL; 2185 return NULL;
2154 } 2186 }
2155 if (!msg) { 2187 if (!msg) {
2156 *skip = 0; 2188 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2189 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2158 if (!msg) { 2190 if (!msg) {
2159 pr_err("unable to allocate msg type %d len %d\n", 2191 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len); 2192 type, front_len);
2161 return ERR_PTR(-ENOMEM); 2193 return NULL;
2162 } 2194 }
2163 } 2195 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2196 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165 2197
2166 if (middle_len) { 2198 if (middle_len && !msg->middle) {
2167 ret = ceph_alloc_middle(con, msg); 2199 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) { 2200 if (ret < 0) {
2170 ceph_msg_put(msg); 2201 ceph_msg_put(msg);
2171 return msg; 2202 return NULL;
2172 } 2203 }
2173 } 2204 }
2174 2205
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4caaa5911110..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -86,6 +84,7 @@ struct ceph_msg {
86 struct kref kref; 84 struct kref kref;
87 bool front_is_vmalloc; 85 bool front_is_vmalloc;
88 bool more_to_follow; 86 bool more_to_follow;
87 bool needs_out_seq;
89 int front_max; 88 int front_max;
90 89
91 struct ceph_msgpool *pool; 90 struct ceph_msgpool *pool;
@@ -143,6 +142,7 @@ struct ceph_connection {
143 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */ 147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -157,7 +157,6 @@ struct ceph_connection {
157 struct list_head out_queue; 157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending; 160 bool out_keepalive_pending;
162 161
163 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -214,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
214 213
215extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217 217
218extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con); 223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con, 224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr); 225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
226extern void ceph_con_close(struct ceph_connection *con); 227extern void ceph_con_close(struct ceph_connection *con);
227extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); 228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
228extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); 229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
@@ -232,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
233extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
234 235
235extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
236 int page_len, int page_off,
237 struct page **pages);
238extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
239 238
240 239
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 890597c09d43..21c62e9b7d1d 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/types.h> 3#include <linux/types.h>
4#include <linux/slab.h>
4#include <linux/random.h> 5#include <linux/random.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6 7
@@ -27,7 +28,7 @@
27 * resend any outstanding requests. 28 * resend any outstanding requests.
28 */ 29 */
29 30
30const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
31 32
32static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
33 34
@@ -103,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103 monc->pending_auth = 1; 104 monc->pending_auth = 1;
104 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
105 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
106 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
107 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
108} 110}
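The ceph_con_revoke() added before ceph_con_send() here (and in __send_subscribe() below) is the idiom that makes statically allocated, reusable messages safe: if a previous copy of the same message is still queued or half-written on the connection, it is pulled back before the refreshed copy is queued. Schematically:

	ceph_con_revoke(monc->con, msg);             /* drop any stale queued copy */
	ceph_con_send(monc->con, ceph_msg_get(msg)); /* connection consumes a ref */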
@@ -186,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
186 monc->want_next_osdmap); 188 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
190 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
191 void *p, *end; 193 void *p, *end;
192 194
193 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
194 if (!msg)
195 return;
196
197 p = msg->front.iov_base; 195 p = msg->front.iov_base;
198 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
199 197
200 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -225,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
225 223
226 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
229 228
230 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
231 } 230 }
@@ -352,14 +351,14 @@ out:
352/* 351/*
353 * statfs 352 * statfs
354 */ 353 */
355static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
356 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
357{ 356{
358 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
360 359
361 while (n) { 360 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
363 if (tid < req->tid) 362 if (tid < req->tid)
364 n = n->rb_left; 363 n = n->rb_left;
365 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -370,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
370 return NULL; 369 return NULL;
371} 370}
372 371
373static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
375{ 374{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
377 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
379 378
380 while (*p) { 379 while (*p) {
381 parent = *p; 380 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
383 if (new->tid < req->tid) 382 if (new->tid < req->tid)
384 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
385 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -389,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
389 } 388 }
390 389
391 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
393} 441}
394 442
395static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
396 struct ceph_msg *msg) 444 struct ceph_msg *msg)
397{ 445{
398 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
399 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
400 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
401 449
402 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
403 goto bad; 451 goto bad;
404 tid = le64_to_cpu(msg->hdr.tid);
405 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
406 453
407 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
408 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
409 if (req) { 456 if (req) {
410 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
411 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
412 } 460 }
413 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
414 if (req) 462 if (req) {
415 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
416 return; 466 return;
417 467
418bad: 468bad:
419 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
420 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
421} 471}
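The get_generic_request()/put_generic_request() pair around complete() closes a use-after-free window: once the handler drops monc->mutex, the thread waiting in ceph_monc_do_statfs() can wake, erase the request from the tree, and drop its reference, so the handler holds its own reference for the duration of the completion. The shape of the pattern:

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		/* fill in result fields under the mutex */
		get_generic_request(req);       /* keep req alive past unlock */
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete(&req->completion);     /* waiter may drop its ref now */
		put_generic_request(req);
	}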
422 472
423/* 473/*
424 * (re)send a statfs request 474 * Do a synchronous statfs().
425 */ 475 */
426static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
427 struct ceph_mon_statfs_request *req)
428{ 477{
429 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
430 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
431 481
432 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
433 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
434 if (IS_ERR(msg)) 484 return -ENOMEM;
435 return PTR_ERR(msg); 485
436 req->request = msg; 486 kref_init(&req->kref);
437 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
438 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
439 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
440 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
441 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
442 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
443 ceph_con_send(monc->con, msg);
444 return 0;
445}
446
447/*
448 * Do a synchronous statfs().
449 */
450int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
451{
452 struct ceph_mon_statfs_request req;
453 int err;
454
455 req.buf = buf;
456 init_completion(&req.completion);
457
458 /* allocate memory for reply */
459 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
460 if (err)
461 return err;
462 504
463 /* register request */ 505 /* register request */
464 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
465 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
466 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
467 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
468 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
469 monc->num_statfs_requests++;
470 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
471 512
472 /* send request and wait */ 513 /* send request and wait */
473 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
474 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
475 err = wait_for_completion_interruptible(&req.completion);
476 516
477 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
478 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
479 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
480 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
481 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
482 521
483 if (!err) 522 if (!err)
484 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
485 return err; 527 return err;
486} 528}
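Put together, the rewritten ceph_monc_do_statfs() follows a preallocate-then-wait lifecycle: both the request and the reply message are allocated up front (so nothing is allocated in the receive path), the request is registered in the tid tree so get_generic_reply() can hand the preallocated buffer to the messenger, and the caller simply blocks on the completion. In outline (a sketch, with the allocation details elided):

	req = kzalloc(sizeof(*req), GFP_NOFS);
	kref_init(&req->kref);                  /* one ref held by the caller */
	req->request = ceph_msg_new(...);       /* outbound message */
	req->reply = ceph_msg_new(...);         /* preallocated inbound buffer */

	/* register under monc->mutex: assign tid, insert into the tree */
	ceph_con_send(monc->con, ceph_msg_get(req->request));
	err = wait_for_completion_interruptible(&req->completion);

	/* unregister under monc->mutex, then drop the caller's ref */
	kref_put(&req->kref, release_generic_request);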
487 529
488/* 530/*
489 * Resend pending statfs requests. 531 * Resend pending statfs requests.
490 */ 532 */
491static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
492{ 534{
493 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
494 struct rb_node *p; 536 struct rb_node *p;
495 537
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
498 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
499 } 542 }
500} 543}
501 544
@@ -585,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587 630
588 /* msg pools */ 631 /* msgs */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
590 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
591 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
592 goto out_monmap; 637 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
594 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
595 if (err < 0) 640 if (!monc->m_subscribe)
596 goto out_pool1; 641 goto out_subscribe_ack;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
598 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
599 goto out_pool2; 644 if (!monc->m_auth_reply)
600 645 goto out_subscribe;
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
602 monc->pending_auth = 0; 648 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
604 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608 651
609 monc->cur_mon = -1; 652 monc->cur_mon = -1;
610 monc->hunting = true; 653 monc->hunting = true;
@@ -612,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
612 monc->sub_sent = 0; 655 monc->sub_sent = 0;
613 656
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
617 monc->last_tid = 0; 660 monc->last_tid = 0;
618 661
619 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -621,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
621 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
622 return 0; 665 return 0;
623 666
624out_pool3: 667out_auth_reply:
625 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
626out_pool2: 669out_subscribe:
627 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
628out_pool1: 671out_subscribe_ack:
629 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
630out_monmap: 673out_monmap:
631 kfree(monc->monmap); 674 kfree(monc->monmap);
632out: 675out:
@@ -650,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
650 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
651 694
652 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
653 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
654 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
655 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
656 699
657 kfree(monc->monmap); 700 kfree(monc->monmap);
658} 701}
@@ -661,8 +704,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
661 struct ceph_msg *msg) 704 struct ceph_msg *msg)
662{ 705{
663 int ret; 706 int ret;
707 int was_auth = 0;
664 708
665 mutex_lock(&monc->mutex); 709 mutex_lock(&monc->mutex);
710 if (monc->auth->ops)
711 was_auth = monc->auth->ops->is_authenticated(monc->auth);
666 monc->pending_auth = 0; 712 monc->pending_auth = 0;
667 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 713 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
668 msg->front.iov_len, 714 msg->front.iov_len,
@@ -673,14 +719,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
673 wake_up(&monc->client->auth_wq); 719 wake_up(&monc->client->auth_wq);
674 } else if (ret > 0) { 720 } else if (ret > 0) {
675 __send_prepared_auth_request(monc, ret); 721 __send_prepared_auth_request(monc, ret);
676 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 722 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
677 dout("authenticated, starting session\n"); 723 dout("authenticated, starting session\n");
678 724
679 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 725 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
680 monc->client->msgr->inst.name.num = monc->auth->global_id; 726 monc->client->msgr->inst.name.num = monc->auth->global_id;
681 727
682 __send_subscribe(monc); 728 __send_subscribe(monc);
683 __resend_statfs(monc); 729 __resend_generic_request(monc);
684 } 730 }
685 mutex_unlock(&monc->mutex); 731 mutex_unlock(&monc->mutex);
686} 732}
@@ -769,18 +815,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
769 815
770 switch (type) { 816 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK: 817 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 818 m = ceph_msg_get(monc->m_subscribe_ack);
773 break; 819 break;
774 case CEPH_MSG_STATFS_REPLY: 820 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 821 return get_generic_reply(con, hdr, skip);
776 break;
777 case CEPH_MSG_AUTH_REPLY: 822 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 823 m = ceph_msg_get(monc->m_auth_reply);
779 break; 824 break;
780 case CEPH_MSG_MON_MAP: 825 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP: 826 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP: 827 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL); 828 m = ceph_msg_new(type, front_len, GFP_NOFS);
784 break; 829 break;
785 } 830 }
786 831
@@ -825,7 +870,7 @@ out:
825 mutex_unlock(&monc->mutex); 870 mutex_unlock(&monc->mutex);
826} 871}
827 872
828const static struct ceph_connection_operations mon_con_ops = { 873static const struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get, 874 .get = ceph_con_get,
830 .put = ceph_con_put, 875 .put = ceph_con_put,
831 .dispatch = dispatch, 876 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
	 	 44	 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
@@ -11,176 +11,54 @@
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We take use a few different
- * strategies:
- *
- * - for request/response type interactions, we preallocate the
- *   memory needed for the response when we generate the request.
- *
- * - for messages we can receive at any time from the MDS, we preallocate
- *   a pool of messages we can re-use.
- *
- * - for writeback, we preallocate some number of messages to use for
- *   requests and their replies, so that we always make forward
- *   progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
-
-
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
-{
-	struct ceph_msg *msg;
-
-	while (pool->num < pool->min) {
-		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-		     pool->min);
-		spin_unlock(&pool->lock);
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-		spin_lock(&pool->lock);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-		msg->pool = pool;
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-	}
-	while (pool->num > pool->min) {
-		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-		     pool->min, msg);
-		list_del_init(&msg->list_head);
-		pool->num--;
-		ceph_msg_kfree(msg);
-	}
-	return 0;
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int min, bool blocking)
-{
-	int ret;
-
-	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-	spin_lock_init(&pool->lock);
-	pool->front_len = front_len;
-	INIT_LIST_HEAD(&pool->msgs);
-	pool->num = 0;
-	pool->min = min;
-	pool->blocking = blocking;
-	init_waitqueue_head(&pool->wait);
-
-	spin_lock(&pool->lock);
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-	dout("msgpool_destroy %p\n", pool);
-	spin_lock(&pool->lock);
-	pool->min = 0;
-	__fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-}
-
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
-{
-	int ret;
-
-	spin_lock(&pool->lock);
-	dout("msgpool_resv %p delta %d\n", pool, delta);
-	pool->min += delta;
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-	wait_queue_t wait;
-	struct ceph_msg *msg;
-
-	if (front_len && front_len > pool->front_len) {
-		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-		       pool, front_len, pool->front_len);
-		WARN_ON(1);
-
-		/* try to alloc a fresh message */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	if (!front_len)
-		front_len = pool->front_len;
-
-	if (pool->blocking) {
-		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	while (1) {
-		spin_lock(&pool->lock);
-		if (likely(pool->num)) {
-			msg = list_entry(pool->msgs.next, struct ceph_msg,
-					 list_head);
-			list_del_init(&msg->list_head);
-			pool->num--;
-			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-			     pool->num, pool->min);
-			spin_unlock(&pool->lock);
-			return msg;
-		}
-		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "may fail");
-		spin_unlock(&pool->lock);
-
-		if (!pool->blocking) {
-			WARN_ON(1);
-
-			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-			if (!IS_ERR(msg))
-				return msg;
-
-			pr_err("msgpool_get %p empty + alloc failed\n", pool);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		init_wait(&wait);
-		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-		schedule();
-		finish_wait(&pool->wait, &wait);
-	}
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-	spin_lock(&pool->lock);
-	if (pool->num < pool->min) {
-		/* reset msg front_len; user may have changed it */
-		msg->front.iov_len = pool->front_len;
-		msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
-		kref_set(&msg->kref, 1);  /* retake a single ref */
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		wake_up(&pool->wait);
-	} else {
-		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		ceph_msg_kfree(msg);
-	}
-}
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
+
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	pool->front_len = front_len;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	pool->name = name;
+	return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		return ceph_msg_new(0, front_len, GFP_NOFS);
+	}
+
+	return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+}
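
For reference, the rewritten msgpool above is a thin wrapper around the kernel's generic mempool_t. A minimal sketch of that pattern, with a hypothetical element type and names (not ceph code):

#include <linux/mempool.h>
#include <linux/slab.h>

struct item {				/* hypothetical pool element */
	char payload[64];
};

static void *item_alloc(gfp_t gfp_mask, void *pool_data)
{
	/* pool_data is the last argument given to mempool_create() */
	return kmalloc(sizeof(struct item), gfp_mask);
}

static void item_free(void *element, void *pool_data)
{
	kfree(element);
}

static mempool_t *item_pool;

static int item_pool_setup(void)
{
	/* keep a reserve of 10 preallocated elements */
	item_pool = mempool_create(10, item_alloc, item_free, NULL);
	return item_pool ? 0 : -ENOMEM;
}

With a sleeping gfp mask such as GFP_NOFS, mempool_alloc() first tries the underlying allocator, then the reserve, and finally waits for an element to come back via mempool_free(); that built-in behavior is what lets the new ceph_msgpool_get() shed the hand-rolled wait queue and __fill_msgpool() accounting above.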
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
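
Condensed from the osd_client changes later in this patch, the call pattern for the revised API looks like the sketch below (sizes illustrative, function hypothetical). Note that the blocking flag survives in the signature but the new init body never reads it; mempool_alloc() now supplies the blocking behavior.

static int msgpool_demo(void)
{
	struct ceph_msgpool pool;
	struct ceph_msg *msg;
	int err;

	err = ceph_msgpool_init(&pool, 4096, 10, true, "demo");
	if (err < 0)
		return err;

	msg = ceph_msgpool_get(&pool, 0);	/* 0 <= front_len: pool alloc */
	if (msg)
		ceph_msgpool_put(&pool, msg);	/* reset front, retake ref */

	ceph_msgpool_destroy(&pool);
	return 0;
}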
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
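
The reworked header keeps a trailing crc32c. A sketch of how a sender could fill that field, assuming (an assumption, not shown in this patch) that the checksum covers every byte of the header before crc itself:

#include <linux/crc32c.h>
#include <linux/stddef.h>

static void msg_header_sign(struct ceph_msg_header *hdr)
{
	/* crc32c over the header up to, but not including, crc */
	hdr->crc = cpu_to_le32(crc32c(0, hdr,
				      offsetof(struct ceph_msg_header, crc)));
}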
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dbe63db9762f..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
@@ -413,11 +418,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
413 */ 418 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{ 420{
421 struct ceph_osd_request *req;
416 int ret = 0; 422 int ret = 0;
417 423
418 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
419 if (list_empty(&osd->o_requests)) { 425 if (list_empty(&osd->o_requests)) {
420 __remove_osd(osdc, osd); 426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
 433 /* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
421 } else { 437 } else {
422 ceph_con_close(&osd->o_con); 438 ceph_con_close(&osd->o_con);
423 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); 439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -554,7 +570,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
554{ 570{
555 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
556 struct ceph_pg pgid; 572 struct ceph_pg pgid;
557 int o = -1; 573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
558 int err; 575 int err;
559 576
560 dout("map_osds %p tid %lld\n", req, req->r_tid); 577 dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -565,10 +582,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
565 pgid = reqhead->layout.ol_pgid; 582 pgid = reqhead->layout.ol_pgid;
566 req->r_pgid = pgid; 583 req->r_pgid = pgid;
567 584
568 o = ceph_calc_pg_primary(osdc->osdmap, pgid); 585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
569 590
570 if ((req->r_osd && req->r_osd->o_osd == o && 591 if ((req->r_osd && req->r_osd->o_osd == o &&
571 req->r_sent >= req->r_osd->o_incarnation) || 592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
572 (req->r_osd == NULL && o == -1)) 595 (req->r_osd == NULL && o == -1))
573 return 0; /* no change */ 596 return 0; /* no change */
574 597
@@ -576,6 +599,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
576 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
577 req->r_osd ? req->r_osd->o_osd : -1); 600 req->r_osd ? req->r_osd->o_osd : -1);
578 601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
579 if (req->r_osd) { 606 if (req->r_osd) {
580 __cancel_request(req); 607 __cancel_request(req);
581 list_del_init(&req->r_osd_item); 608 list_del_init(&req->r_osd_item);
@@ -601,7 +628,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
601 __remove_osd_from_lru(req->r_osd); 628 __remove_osd_from_lru(req->r_osd);
602 list_add(&req->r_osd_item, &req->r_osd->o_requests); 629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
603 } 630 }
604 err = 1; /* osd changed */ 631 err = 1; /* osd or pg changed */
605 632
606out: 633out:
607 return err; 634 return err;
@@ -633,7 +660,7 @@ static int __send_request(struct ceph_osd_client *osdc,
633 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ 660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
634 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
635 662
636 req->r_sent_stamp = jiffies; 663 req->r_stamp = jiffies;
637 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
638 665
639 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
@@ -660,7 +687,7 @@ static void handle_timeout(struct work_struct *work)
660 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; 687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
661 unsigned long keepalive = 688 unsigned long keepalive =
662 osdc->client->mount_args->osd_keepalive_timeout * HZ; 689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
663 unsigned long last_sent = 0; 690 unsigned long last_stamp = 0;
664 struct rb_node *p; 691 struct rb_node *p;
665 struct list_head slow_osds; 692 struct list_head slow_osds;
666 693
@@ -693,16 +720,16 @@ static void handle_timeout(struct work_struct *work)
693 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
694 * it from an updated osd map. 721 * it from an updated osd map.
695 */ 722 */
696 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
697 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
698 r_req_lru_item); 725 r_req_lru_item);
699 726
700 if (time_before(jiffies, req->r_sent_stamp + timeout)) 727 if (time_before(jiffies, req->r_stamp + timeout))
701 break; 728 break;
702 729
703 BUG_ON(req == last_req && req->r_sent_stamp == last_sent); 730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
704 last_req = req; 731 last_req = req;
705 last_sent = req->r_sent_stamp; 732 last_stamp = req->r_stamp;
706 733
707 osd = req->r_osd; 734 osd = req->r_osd;
708 BUG_ON(!osd); 735 BUG_ON(!osd);
@@ -718,7 +745,7 @@ static void handle_timeout(struct work_struct *work)
718 */ 745 */
719 INIT_LIST_HEAD(&slow_osds); 746 INIT_LIST_HEAD(&slow_osds);
720 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
721 if (time_before(jiffies, req->r_sent_stamp + keepalive)) 748 if (time_before(jiffies, req->r_stamp + keepalive))
722 break; 749 break;
723 750
724 osd = req->r_osd; 751 osd = req->r_osd;
@@ -768,16 +795,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
768 struct ceph_osd_request *req; 795 struct ceph_osd_request *req;
769 u64 tid; 796 u64 tid;
770 int numops, object_len, flags; 797 int numops, object_len, flags;
798 s32 result;
771 799
772 tid = le64_to_cpu(msg->hdr.tid); 800 tid = le64_to_cpu(msg->hdr.tid);
773 if (msg->front.iov_len < sizeof(*rhead)) 801 if (msg->front.iov_len < sizeof(*rhead))
774 goto bad; 802 goto bad;
775 numops = le32_to_cpu(rhead->num_ops); 803 numops = le32_to_cpu(rhead->num_ops);
776 object_len = le32_to_cpu(rhead->object_len); 804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
777 if (msg->front.iov_len != sizeof(*rhead) + object_len + 806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
778 numops * sizeof(struct ceph_osd_op)) 807 numops * sizeof(struct ceph_osd_op))
779 goto bad; 808 goto bad;
780 dout("handle_reply %p tid %llu\n", msg, tid); 809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
781 810
782 /* lookup */ 811 /* lookup */
783 mutex_lock(&osdc->request_mutex); 812 mutex_lock(&osdc->request_mutex);
@@ -823,7 +852,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
823 dout("handle_reply tid %llu flags %d\n", tid, flags); 852 dout("handle_reply tid %llu flags %d\n", tid, flags);
824 853
825 /* either this is a read, or we got the safe response */ 854 /* either this is a read, or we got the safe response */
826 if ((flags & CEPH_OSD_FLAG_ONDISK) || 855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
827 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
828 __unregister_request(osdc, req); 858 __unregister_request(osdc, req);
829 859
@@ -862,7 +892,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
862 892
863 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); 893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
864 if (kickosd) { 894 if (kickosd) {
865 __reset_osd(osdc, kickosd); 895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
866 } else { 898 } else {
867 for (p = rb_first(&osdc->osds); p; p = n) { 899 for (p = rb_first(&osdc->osds); p; p = n) {
868 struct ceph_osd *osd = 900 struct ceph_osd *osd =
@@ -913,7 +945,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
913 945
914kick: 946kick:
915 dout("kicking %p tid %llu osd%d\n", req, req->r_tid, 947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
916 req->r_osd->o_osd); 948 req->r_osd ? req->r_osd->o_osd : -1);
917 req->r_flags |= CEPH_OSD_FLAG_RETRY; 949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
918 err = __send_request(osdc, req); 950 err = __send_request(osdc, req);
919 if (err) { 951 if (err) {
@@ -1051,6 +1083,7 @@ done:
1051 if (newmap) 1083 if (newmap)
1052 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1053 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up(&osdc->client->auth_wq);
1054 return; 1087 return;
1055 1088
1056bad: 1089bad:
@@ -1060,45 +1093,6 @@ bad:
1060 return; 1093 return;
1061} 1094}
1062 1095
1063
1064/*
1065 * A read request prepares specific pages that data is to be read into.
1066 * When a message is being read off the wire, we call prepare_pages to
1067 * find those pages.
1068 * 0 = success, -1 failure.
1069 */
1070static int __prepare_pages(struct ceph_connection *con,
1071 struct ceph_msg_header *hdr,
1072 struct ceph_osd_request *req,
1073 u64 tid,
1074 struct ceph_msg *m)
1075{
1076 struct ceph_osd *osd = con->private;
1077 struct ceph_osd_client *osdc;
1078 int ret = -1;
1079 int data_len = le32_to_cpu(hdr->data_len);
1080 unsigned data_off = le16_to_cpu(hdr->data_off);
1081
1082 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1083
1084 if (!osd)
1085 return -1;
1086
1087 osdc = osd->o_osdc;
1088
1089 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1090 tid, req->r_num_pages, want);
1091 if (unlikely(req->r_num_pages < want))
1092 goto out;
1093 m->pages = req->r_pages;
1094 m->nr_pages = req->r_num_pages;
1095 ret = 0; /* success */
1096out:
1097 BUG_ON(ret < 0 || m->nr_pages < want);
1098
1099 return ret;
1100}
1101
1102/* 1096/*
1103 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1104 */ 1098 */
@@ -1225,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1225 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1226 goto out; 1220 goto out;
1227 1221
1228 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1229 if (err < 0) 1224 if (err < 0)
1230 goto out_mempool; 1225 goto out_mempool;
1231 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1232 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1233 if (err < 0) 1229 if (err < 0)
1234 goto out_msgpool; 1230 goto out_msgpool;
1235 return 0; 1231 return 0;
@@ -1275,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1275 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1276 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1277 false, 1); 1273 false, 1);
1278 if (IS_ERR(req)) 1274 if (!req)
1279 return PTR_ERR(req); 1275 return -ENOMEM;
1280 1276
1281 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1282 req->r_pages = pages; 1278 req->r_pages = pages;
@@ -1318,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1318 snapc, do_sync, 1314 snapc, do_sync,
1319 truncate_seq, truncate_size, mtime, 1315 truncate_seq, truncate_size, mtime,
1320 nofail, 1); 1316 nofail, 1);
1321 if (IS_ERR(req)) 1317 if (!req)
1322 return PTR_ERR(req); 1318 return -ENOMEM;
1323 1319
1324 /* it may be a short write due to an object boundary */ 1320 /* it may be a short write due to an object boundary */
1325 req->r_pages = pages; 1321 req->r_pages = pages;
@@ -1367,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1367} 1363}
1368 1364
1369/* 1365/*
1370 * lookup and return message for incoming reply 1366 * lookup and return message for incoming reply. set up reply message
1367 * pages.
1371 */ 1368 */
1372static struct ceph_msg *get_reply(struct ceph_connection *con, 1369static struct ceph_msg *get_reply(struct ceph_connection *con,
1373 struct ceph_msg_header *hdr, 1370 struct ceph_msg_header *hdr,
@@ -1380,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1380 int front = le32_to_cpu(hdr->front_len); 1377 int front = le32_to_cpu(hdr->front_len);
1381 int data_len = le32_to_cpu(hdr->data_len); 1378 int data_len = le32_to_cpu(hdr->data_len);
1382 u64 tid; 1379 u64 tid;
1383 int err;
1384 1380
1385 tid = le64_to_cpu(hdr->tid); 1381 tid = le64_to_cpu(hdr->tid);
1386 mutex_lock(&osdc->request_mutex); 1382 mutex_lock(&osdc->request_mutex);
@@ -1398,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1398 req->r_reply, req->r_con_filling_msg); 1394 req->r_reply, req->r_con_filling_msg);
1399 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1395 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1400 ceph_con_put(req->r_con_filling_msg); 1396 ceph_con_put(req->r_con_filling_msg);
1397 req->r_con_filling_msg = NULL;
1401 } 1398 }
1402 1399
1403 if (front > req->r_reply->front.iov_len) { 1400 if (front > req->r_reply->front.iov_len) {
1404 pr_warning("get_reply front %d > preallocated %d\n", 1401 pr_warning("get_reply front %d > preallocated %d\n",
1405 front, (int)req->r_reply->front.iov_len); 1402 front, (int)req->r_reply->front.iov_len);
1406 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1403 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1407 if (IS_ERR(m)) 1404 if (!m)
1408 goto out; 1405 goto out;
1409 ceph_msg_put(req->r_reply); 1406 ceph_msg_put(req->r_reply);
1410 req->r_reply = m; 1407 req->r_reply = m;
@@ -1412,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1412 m = ceph_msg_get(req->r_reply); 1409 m = ceph_msg_get(req->r_reply);
1413 1410
1414 if (data_len > 0) { 1411 if (data_len > 0) {
1415 err = __prepare_pages(con, hdr, req, tid, m); 1412 unsigned data_off = le16_to_cpu(hdr->data_off);
1416 if (err < 0) { 1413 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1414
1415 if (unlikely(req->r_num_pages < want)) {
1416 pr_warning("tid %lld reply %d > expected %d pages\n",
1417 tid, want, m->nr_pages);
1417 *skip = 1; 1418 *skip = 1;
1418 ceph_msg_put(m); 1419 ceph_msg_put(m);
1419 m = ERR_PTR(err); 1420 m = NULL;
1421 goto out;
1420 } 1422 }
1423 m->pages = req->r_pages;
1424 m->nr_pages = req->r_num_pages;
1421 } 1425 }
1422 *skip = 0; 1426 *skip = 0;
1423 req->r_con_filling_msg = ceph_con_get(con); 1427 req->r_con_filling_msg = ceph_con_get(con);
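
calc_pages_for(), used above to size the receive, turns an in-page offset plus byte length into a page count. Its definition is outside this patch; the sketch below is an assumed equivalent. For example, off = 512 and len = 8192 with 4k pages touches pages 0..2, so want = 3:

#include <linux/mm.h>

/* pages touched by an extent of 'len' bytes starting 'off' into a page */
static int pages_for(unsigned int off, unsigned int len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}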
@@ -1439,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1439 1443
1440 switch (type) { 1444 switch (type) {
1441 case CEPH_MSG_OSD_MAP: 1445 case CEPH_MSG_OSD_MAP:
1442 return ceph_msg_new(type, front, 0, 0, NULL); 1446 return ceph_msg_new(type, front, GFP_NOFS);
1443 case CEPH_MSG_OSD_OPREPLY: 1447 case CEPH_MSG_OSD_OPREPLY:
1444 return get_reply(con, hdr, skip); 1448 return get_reply(con, hdr, skip);
1445 default: 1449 default:
@@ -1525,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1525 return ceph_monc_validate_auth(&osdc->client->monc); 1529 return ceph_monc_validate_auth(&osdc->client->monc);
1526} 1530}
1527 1531
1528const static struct ceph_connection_operations osd_con_ops = { 1532static const struct ceph_connection_operations osd_con_ops = {
1529 .get = get_osd_con, 1533 .get = get_osd_con,
1530 .put = put_osd_con, 1534 .put = put_osd_con,
1531 .dispatch = dispatch, 1535 .dispatch = dispatch,
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 1b1a3ca43afc..ce776989ef6a 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,8 @@ struct ceph_osd_request {
48 struct list_head r_osd_item; 48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd; 49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid; 50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
51 53
52 struct ceph_connection *r_con_filling_msg; 54 struct ceph_connection *r_con_filling_msg;
53 55
@@ -66,11 +68,10 @@ struct ceph_osd_request {
66 struct list_head r_unsafe_item; 68 struct list_head r_unsafe_item;
67 69
68 struct inode *r_inode; /* for use by callbacks */ 70 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70 71
71 char r_oid[40]; /* object name */ 72 char r_oid[40]; /* object name */
72 int r_oid_len; 73 int r_oid_len;
73 unsigned long r_sent_stamp; 74 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */ 75 bool r_resend; /* msg send failed, needs retry */
75 76
76 struct ceph_file_layout r_file_layout; 77 struct ceph_file_layout r_file_layout;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index b83f2692b835..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1,4 +1,7 @@
1 1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
2#include <asm/div64.h> 5#include <asm/div64.h>
3 6
4#include "super.h" 7#include "super.h"
@@ -6,7 +9,6 @@
6#include "crush/hash.h" 9#include "crush/hash.h"
7#include "crush/mapper.h" 10#include "crush/mapper.h"
8#include "decode.h" 11#include "decode.h"
9#include "ceph_debug.h"
10 12
11char *ceph_osdmap_state_str(char *str, int len, int state) 13char *ceph_osdmap_state_str(char *str, int len, int state)
12{ 14{
@@ -312,71 +314,6 @@ bad:
312 return ERR_PTR(err); 314 return ERR_PTR(err);
313} 315}
314 316
315
316/*
317 * osd map
318 */
319void ceph_osdmap_destroy(struct ceph_osdmap *map)
320{
321 dout("osdmap_destroy %p\n", map);
322 if (map->crush)
323 crush_destroy(map->crush);
324 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
325 struct ceph_pg_mapping *pg =
326 rb_entry(rb_first(&map->pg_temp),
327 struct ceph_pg_mapping, node);
328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg);
330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
338 kfree(map->osd_state);
339 kfree(map->osd_weight);
340 kfree(map->osd_addr);
341 kfree(map);
342}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
347static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
348{
349 u8 *state;
350 struct ceph_entity_addr *addr;
351 u32 *weight;
352
353 state = kcalloc(max, sizeof(*state), GFP_NOFS);
354 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
355 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
356 if (state == NULL || addr == NULL || weight == NULL) {
357 kfree(state);
358 kfree(addr);
359 kfree(weight);
360 return -ENOMEM;
361 }
362
363 /* copy old? */
364 if (map->osd_state) {
365 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
366 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
367 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
368 kfree(map->osd_state);
369 kfree(map->osd_addr);
370 kfree(map->osd_weight);
371 }
372
373 map->osd_state = state;
374 map->osd_weight = weight;
375 map->osd_addr = addr;
376 map->max_osd = max;
377 return 0;
378}
379
380/* 317/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds) 319 * to a set of osds)
@@ -480,6 +417,113 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
480 return NULL; 417 return NULL;
481} 418}
482 419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
483/* 527/*
484 * decode a full map. 528 * decode a full map.
485 */ 529 */
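
Read back from __decode_pool_names() in the hunk above, the pool-name blob it parses appears to be laid out as follows (inferred from the decoder; integers little-endian on the wire):

/*
 * u32 num;			number of (pool, name) entries
 * repeated num times:
 *	u32 pool;		pool id
 *	u32 len;		name length in bytes
 *	char name[len];		raw bytes, not NUL-terminated
 */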
@@ -516,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
516 ceph_decode_32_safe(p, end, max, bad); 560 ceph_decode_32_safe(p, end, max, bad);
517 while (max--) { 561 while (max--) {
518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
519 pi = kmalloc(sizeof(*pi), GFP_NOFS); 563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
520 if (!pi) 564 if (!pi)
521 goto bad; 565 goto bad;
522 pi->id = ceph_decode_32(p); 566 pi->id = ceph_decode_32(p);
@@ -526,13 +570,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
526 ev, CEPH_PG_POOL_VERSION); 570 ev, CEPH_PG_POOL_VERSION);
527 goto bad; 571 goto bad;
528 } 572 }
529 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 573 __decode_pool(p, pi);
530 __insert_pg_pool(&map->pg_pools, pi); 574 __insert_pg_pool(&map->pg_pools, pi);
531 calc_pg_masks(pi);
532 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
533 *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
534 * sizeof(u64) * 2;
535 } 575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
536 ceph_decode_32_safe(p, end, map->pool_max, bad); 580 ceph_decode_32_safe(p, end, map->pool_max, bad);
537 581
538 ceph_decode_32_safe(p, end, map->flags, bad); 582 ceph_decode_32_safe(p, end, map->flags, bad);
@@ -662,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
662 len, *p, end); 706 len, *p, end);
663 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
664 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
665 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
666 } 710 }
667 711
668 /* new flags? */ 712 /* new flags? */
@@ -706,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 } 750 }
707 pi = __lookup_pg_pool(&map->pg_pools, pool); 751 pi = __lookup_pg_pool(&map->pg_pools, pool);
708 if (!pi) { 752 if (!pi) {
709 pi = kmalloc(sizeof(*pi), GFP_NOFS); 753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) { 754 if (!pi) {
711 err = -ENOMEM; 755 err = -ENOMEM;
712 goto bad; 756 goto bad;
@@ -714,9 +758,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
714 pi->id = pool; 758 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi); 759 __insert_pg_pool(&map->pg_pools, pi);
716 } 760 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 761 __decode_pool(p, pi);
718 calc_pg_masks(pi);
719 } 762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
720 765
721 /* old_pool */ 766 /* old_pool */
722 ceph_decode_32_safe(p, end, len, bad); 767 ceph_decode_32_safe(p, end, len, bad);
@@ -725,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
725 770
726 ceph_decode_32_safe(p, end, pool, bad); 771 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool); 772 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) { 773 if (pi)
729 rb_erase(&pi->node, &map->pg_pools); 774 __remove_pg_pool(&map->pg_pools, pi);
730 kfree(pi);
731 }
732 } 775 }
733 776
734 /* new_up */ 777 /* new_up */
@@ -998,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
998} 1041}
999 1042
1000/* 1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
1063
1064/*
1001 * Return primary osd for given pgid, or -1 if none. 1065 * Return primary osd for given pgid, or -1 if none.
1002 */ 1066 */
1003int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1004{ 1068{
1005 int rawosds[10], *osds; 1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1006 int i, num = ARRAY_SIZE(rawosds); 1070 int i, num = CEPH_PG_MAX_SIZE;
1007 1071
1008 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1009 if (!osds) 1073 if (!osds)
@@ -1011,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1011 1075
1012 /* primary is first up osd */ 1076 /* primary is first up osd */
1013 for (i = 0; i < num; i++) 1077 for (i = 0; i < num; i++)
1014 if (ceph_osd_is_up(osdmap, osds[i])) { 1078 if (ceph_osd_is_up(osdmap, osds[i]))
1015 return osds[i]; 1079 return osds[i];
1016 break;
1017 }
1018 return -1; 1080 return -1;
1019} 1081}
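
ceph_calc_pg_acting() is what the __map_osds() hunk in osd_client.c above consumes; a condensed sketch of the call pattern (hypothetical wrapper):

static int pg_primary_of(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int acting[CEPH_PG_MAX_SIZE];
	int num = ceph_calc_pg_acting(osdmap, pgid, acting);

	/* num <= 0: CRUSH lookup failed or no osd in the set is up */
	return num > 0 ? acting[0] : -1;
}

Recording the whole set in r_pg_osds/r_num_pg_osds, rather than just the primary, is what lets __map_osds() notice when any replica in the acting set changes and resend accordingly.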
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb2642..970b547e510d 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
23 int id; 23 int id;
24 struct ceph_pg_pool v; 24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
26}; 27};
27 28
28struct ceph_pg_mapping { 29struct ceph_pg_mapping {
@@ -119,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid, 120 const char *oid,
120 struct ceph_file_layout *fl, 121 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap); 122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid); 126 struct ceph_pg pgid);
124 127
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 370e93695474..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -1,4 +1,5 @@
1 1
2#include <linux/gfp.h>
2#include <linux/pagemap.h> 3#include <linux/pagemap.h>
3#include <linux/highmem.h> 4#include <linux/highmem.h>
4 5
@@ -19,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
19 20
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{ 22{
22 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
23 if (!page) 24 if (!page)
24 return -ENOMEM; 25 return -ENOMEM;
25 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a676..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
11/* 11/*
12 * osdmap encoding versions 12 * osdmap encoding versions
13 */ 13 */
14#define CEPH_OSDMAP_INC_VERSION 4 14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_VERSION 4 15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
16 18
17/* 19/*
18 * fs id 20 * fs id
@@ -56,6 +58,7 @@ struct ceph_timespec {
56#define CEPH_PG_LAYOUT_LINEAR 2 58#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3 59#define CEPH_PG_LAYOUT_HYBRID 3
58 60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
59 62
60/* 63/*
61 * placement group. 64 * placement group.
@@ -98,8 +101,8 @@ struct ceph_pg_pool {
98 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps; 103 __le32 num_snaps;
101 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
102 __le64 uid; 105 __le64 auid; /* who owns the pg */
103} __attribute__ ((packed)); 106} __attribute__ ((packed));
104 107
105/* 108/*
@@ -205,6 +208,7 @@ enum {
205 /* read */ 208 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
208 212
209 /* write */ 213 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -302,6 +306,22 @@ enum {
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
305/* 325/*
306 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
307 * payload 327 * payload
@@ -318,6 +338,8 @@ struct ceph_osd_op {
318 struct { 338 struct {
319 __le32 name_len; 339 __le32 name_len;
320 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
321 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
322 struct { 344 struct {
323 __u8 class_len; 345 __u8 class_len;
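
A sketch of how a client might fill the extended xattr union for the new compare op (hypothetical helper; only the field layout comes from the struct above, and the name/value bytes themselves would travel in the op's data payload):

#include <linux/string.h>

static void prep_cmpxattr_eq(struct ceph_osd_op *op,
			     const char *name, const char *value)
{
	op->op = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
	op->xattr.name_len = cpu_to_le32(strlen(name));
	op->xattr.value_len = cpu_to_le32(strlen(value));
	op->xattr.cmp_op = CEPH_OSD_CMPXATTR_OP_EQ;	/* __u8: no swab */
	op->xattr.cmp_mode = CEPH_OSD_CMPXATTR_MODE_STRING;
}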
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index bf2a5f3846a4..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h>
4 5
5#include "super.h" 6#include "super.h"
6#include "decode.h" 7#include "decode.h"
@@ -314,9 +315,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
314 because we rebuild_snap_realms() works _downward_ in 315 because we rebuild_snap_realms() works _downward_ in
315 hierarchy after each update.) */ 316 hierarchy after each update.) */
316 if (realm->cached_context && 317 if (realm->cached_context &&
317 realm->cached_context->seq <= realm->seq && 318 realm->cached_context->seq == realm->seq &&
318 (!parent || 319 (!parent ||
319 realm->cached_context->seq <= parent->cached_context->seq)) { 320 realm->cached_context->seq >= parent->cached_context->seq)) {
320 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" 321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
321 " (unchanged)\n", 322 " (unchanged)\n",
322 realm->ino, realm, realm->cached_context, 323 realm->ino, realm, realm->cached_context,
@@ -430,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change). 432 * change).
432 */ 433 */
433void ceph_queue_cap_snap(struct ceph_inode_info *ci, 434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
434 struct ceph_snap_context *snapc)
435{ 435{
436 struct inode *inode = &ci->vfs_inode; 436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 437 struct ceph_cap_snap *capsnap;
@@ -450,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
450 as no new writes are allowed to start when pending, so any 450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous 451 writes in progress now were started before the previous
452 cap_snap. lucky us. */ 452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p snapc %p seq %llu used %d" 453 dout("queue_cap_snap %p already pending\n", inode);
454 " already pending\n", inode, snapc, snapc->seq, used);
455 kfree(capsnap); 454 kfree(capsnap);
456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
457 igrab(inode); 458 igrab(inode);
458 459
459 atomic_set(&capsnap->nref, 1); 460 atomic_set(&capsnap->nref, 1);
@@ -462,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
462 INIT_LIST_HEAD(&capsnap->flushing_item); 463 INIT_LIST_HEAD(&capsnap->flushing_item);
463 464
464 capsnap->follows = snapc->seq - 1; 465 capsnap->follows = snapc->seq - 1;
465 capsnap->context = ceph_get_snap_context(snapc);
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 467 capsnap->dirty = __ceph_caps_dirty(ci);
468 468
@@ -479,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
479 snapshot. */ 479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 481 ci->i_wrbuffer_ref_head = 0;
482 ceph_put_snap_context(ci->i_head_snapc); 482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 485
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
@@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
521 capsnap->ctime = inode->i_ctime; 521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq; 522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) { 523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap, 525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq, 526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages); 527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
528 return 0; 529 return 0;
529 } 530 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
531 inode, capsnap, capsnap->context, 532 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size); 533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
533 535
534 spin_lock(&mdsc->snap_flush_lock); 536 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
@@ -601,7 +603,7 @@ more:
601 if (lastinode) 603 if (lastinode)
602 iput(lastinode); 604 iput(lastinode);
603 lastinode = inode; 605 lastinode = inode;
604 ceph_queue_cap_snap(ci, realm->cached_context); 606 ceph_queue_cap_snap(ci);
605 spin_lock(&realm->inodes_with_caps_lock); 607 spin_lock(&realm->inodes_with_caps_lock);
606 } 608 }
607 spin_unlock(&realm->inodes_with_caps_lock); 609 spin_unlock(&realm->inodes_with_caps_lock);
@@ -818,11 +820,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
818 * queued (again) by ceph_update_snap_trace() 820 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context. 821 * below. Queue it _now_, under the old context.
820 */ 822 */
823 spin_lock(&realm->inodes_with_caps_lock);
821 list_del_init(&ci->i_snap_realm_item); 824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
822 spin_unlock(&inode->i_lock); 826 spin_unlock(&inode->i_lock);
823 827
824 ceph_queue_cap_snap(ci, 828 ceph_queue_cap_snap(ci);
825 ci->i_snap_realm->cached_context);
826 829
827 iput(inode); 830 iput(inode);
828 continue; 831 continue;
@@ -866,16 +869,20 @@ skip_inode:
866 continue; 869 continue;
867 ci = ceph_inode(inode); 870 ci = ceph_inode(inode);
868 spin_lock(&inode->i_lock); 871 spin_lock(&inode->i_lock);
869 if (!ci->i_snap_realm) 872 if (list_empty(&ci->i_snap_realm_item)) {
870 goto split_skip_inode; 873 struct ceph_snap_realm *oldrealm =
871 ceph_put_snap_realm(mdsc, ci->i_snap_realm); 874 ci->i_snap_realm;
872 spin_lock(&realm->inodes_with_caps_lock); 875
873 list_add(&ci->i_snap_realm_item, 876 dout(" moving %p to split realm %llx %p\n",
874 &realm->inodes_with_caps); 877 inode, realm->ino, realm);
875 ci->i_snap_realm = realm; 878 spin_lock(&realm->inodes_with_caps_lock);
876 spin_unlock(&realm->inodes_with_caps_lock); 879 list_add(&ci->i_snap_realm_item,
877 ceph_get_snap_realm(mdsc, realm); 880 &realm->inodes_with_caps);
878split_skip_inode: 881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
879 spin_unlock(&inode->i_lock); 886 spin_unlock(&inode->i_lock);
880 iput(inode); 887 iput(inode);
881 } 888 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4290a6e860b0..4e0bee240b9d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,13 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h> 14#include <linux/statfs.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18 16
19#include "decode.h" 17#include "decode.h"
20#include "super.h" 18#include "super.h"
@@ -46,10 +44,20 @@ const char *ceph_file_part(const char *s, int len)
46 */ 44 */
47static void ceph_put_super(struct super_block *s) 45static void ceph_put_super(struct super_block *s)
48{ 46{
49 struct ceph_client *cl = ceph_client(s); 47 struct ceph_client *client = ceph_sb_to_client(s);
50 48
51 dout("put_super\n"); 49 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc); 50 ceph_mdsc_close_sessions(&client->mdsc);
51
52 /*
53 * ensure we release the bdi before put_anon_super releases
54 * the device name.
55 */
56 if (s->s_bdi == &client->backing_dev_info) {
57 bdi_unregister(&client->backing_dev_info);
58 s->s_bdi = NULL;
59 }
60
53 return; 61 return;
54} 62}
55 63
@@ -96,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
96static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
97{ 105{
98 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
102 return 0; 110 return 0;
103} 111}
104 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
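
Working one row of the table: with 1GB of RAM and 4k pages, totalram_pages is 262144, int_sqrt() of that is 512, and 16*512 = 8192 shifted left by PAGE_SHIFT-10 = 2 gives the listed 32768k. The same arithmetic as a standalone userspace check (assumes 4k pages):

#include <math.h>
#include <stdio.h>

int main(void)
{
	long pages = (1L << 30) / 4096;			/* 1GB of 4k pages */
	long kb = (16L * (long)sqrt(pages)) << 2;	/* PAGE_SHIFT-10 == 2 */

	if (kb > 256 * 1024)				/* cap at 256M */
		kb = 256 * 1024;
	printf("%ldk\n", kb);				/* prints 32768k */
	return 0;
}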
105 141
106/** 142/**
107 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -127,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
127 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name) 197 if (args->name)
@@ -150,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
150 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
151} 216}
152 217
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
181
182static int __init init_caches(void) 218static int __init init_caches(void)
183{ 219{
184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -297,7 +333,9 @@ enum {
297 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
300 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
301 Opt_congestion_kb, 339 Opt_congestion_kb,
302 Opt_last_int, 340 Opt_last_int,
303 /* int args above */ 341 /* int args above */
@@ -328,7 +366,9 @@ static match_table_t arg_tokens = {
328 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
329 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
330 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
331 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
332 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
333 /* int args above */ 373 /* int args above */
334 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
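
The two new tokens slot into the standard <linux/parser.h> flow. A sketch of how such a table is typically consumed (illustrative, not the verbatim ceph parser; c, err, and args stand in for locals of the surrounding parse function):

/* illustrative consumption of a match_table_t such as arg_tokens */
substring_t argstr[MAX_OPT_ARGS];
int intval;
int token = match_token(c, arg_tokens, argstr);

if (token < Opt_last_int) {
	err = match_int(&argstr[0], &intval);  /* decode the "%d" argument */
	if (err < 0)
		goto out;                      /* malformed integer */
}

switch (token) {
case Opt_readdir_max_bytes:                    /* matches the hunk below */
	args->max_readdir_bytes = intval;
	break;
/* ... remaining cases ... */
}
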
@@ -377,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
381 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
382 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
383 424
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -486,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
486 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
487 args->max_readdir = intval; 528 args->max_readdir = intval;
488 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
489 case Opt_congestion_kb: 533 case Opt_congestion_kb:
490 args->congestion_kb = intval; 534 args->congestion_kb = intval;
491 break; 535 break;
@@ -625,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
625 669
626 /* unmount */ 670 /* unmount */
627 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
628 ceph_monc_stop(&client->monc);
629 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
630 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
631 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
632 684
633 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -635,6 +687,8 @@ static void ceph_destroy_client(struct ceph_client *client)
635 destroy_workqueue(client->pg_inv_wq); 687 destroy_workqueue(client->pg_inv_wq);
636 destroy_workqueue(client->trunc_wq); 688 destroy_workqueue(client->trunc_wq);
637 689
690 bdi_destroy(&client->backing_dev_info);
691
638 if (client->msgr) 692 if (client->msgr)
639 ceph_messenger_destroy(client->msgr); 693 ceph_messenger_destroy(client->msgr);
640 mempool_destroy(client->wb_pagevec_pool); 694 mempool_destroy(client->wb_pagevec_pool);
@@ -669,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
669/* 723/*
670 * true if we have the mon map (and have thus joined the cluster) 724 * true if we have the mon map (and have thus joined the cluster)
671 */ 725 */
672static int have_mon_map(struct ceph_client *client) 726static int have_mon_and_osd_map(struct ceph_client *client)
673{ 727{
674 return client->monc.monmap && client->monc.monmap->epoch; 728 return client->monc.monmap && client->monc.monmap->epoch &&
729 client->osdc.osdmap && client->osdc.osdmap->epoch;
675} 730}
676 731
677/* 732/*
@@ -691,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
691 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req)) 748 if (IS_ERR(req))
694 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
695 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -749,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
749 if (err < 0) 804 if (err < 0)
750 goto out; 805 goto out;
751 806
752 while (!have_mon_map(client)) { 807 while (!have_mon_and_osd_map(client)) {
753 err = -EIO; 808 err = -EIO;
754 if (timeout && time_after_eq(jiffies, started + timeout)) 809 if (timeout && time_after_eq(jiffies, started + timeout))
755 goto out; 810 goto out;
@@ -757,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
757 /* wait */ 812 /* wait */
758 dout("mount waiting for mon_map\n"); 813 dout("mount waiting for mon_map\n");
759 err = wait_event_interruptible_timeout(client->auth_wq, 814 err = wait_event_interruptible_timeout(client->auth_wq,
760 have_mon_map(client) || (client->auth_err < 0), 815 have_mon_and_osd_map(client) || (client->auth_err < 0),
761 timeout); 816 timeout);
762 if (err == -EINTR || err == -ERESTARTSYS) 817 if (err == -EINTR || err == -ERESTARTSYS)
763 goto out; 818 goto out;
764 if (client->auth_err < 0) { 819 if (client->auth_err < 0) {
@@ -871,18 +926,21 @@ static int ceph_compare_super(struct super_block *sb, void *data)
871/* 926/*
872 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
873 */ 928 */
929static atomic_long_t bdi_seq = ATOMIC_INIT(0);
930
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{ 932{
876 int err; 933 int err;
877 934
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */ 935 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 936 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages = 937 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 938 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_SHIFT; 939 >> PAGE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 940 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
941 atomic_long_inc_return(&bdi_seq));
942 if (!err)
943 sb->s_bdi = &client->backing_dev_info;
886 return err; 944 return err;
887} 945}
888 946
@@ -919,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
919 goto out; 977 goto out;
920 } 978 }
921 979
922 if (ceph_client(sb) != client) { 980 if (ceph_sb_to_client(sb) != client) {
923 ceph_destroy_client(client); 981 ceph_destroy_client(client);
924 client = ceph_client(sb); 982 client = ceph_sb_to_client(sb);
925 dout("get_sb got existing client %p\n", client); 983 dout("get_sb got existing client %p\n", client);
926 } else { 984 } else {
927 dout("get_sb using new client %p\n", client); 985 dout("get_sb using new client %p\n", client);
@@ -939,8 +997,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
939 997
940out_splat: 998out_splat:
941 ceph_mdsc_close_sessions(&client->mdsc); 999 ceph_mdsc_close_sessions(&client->mdsc);
942 up_write(&sb->s_umount); 1000 deactivate_locked_super(sb);
943 deactivate_super(sb);
944 goto out_final; 1001 goto out_final;
945 1002
946out: 1003out:
@@ -956,9 +1013,6 @@ static void ceph_kill_sb(struct super_block *s)
956 dout("kill_sb %p\n", s); 1013 dout("kill_sb %p\n", s);
957 ceph_mdsc_pre_umount(&client->mdsc); 1014 ceph_mdsc_pre_umount(&client->mdsc);
958 kill_anon_super(s); /* will call put_super after sb is r/o */ 1015 kill_anon_super(s); /* will call put_super after sb is r/o */
959 if (s->s_bdi == &client->backing_dev_info)
960 bdi_unregister(&client->backing_dev_info);
961 bdi_destroy(&client->backing_dev_info);
962 ceph_destroy_client(client); 1016 ceph_destroy_client(client);
963} 1017}
964 1018
@@ -995,9 +1049,10 @@ static int __init init_ceph(void)
995 if (ret) 1049 if (ret)
996 goto out_icache; 1050 goto out_icache;
997 1051
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", 1052 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, 1053 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); 1054 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1055 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1001 return 0; 1056 return 0;
1002 1057
1003out_icache: 1058out_icache:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 65d12036b670..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -12,6 +12,7 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/wait.h> 13#include <linux/wait.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h>
15 16
16#include "types.h" 17#include "types.h"
17#include "messenger.h" 18#include "messenger.h"
@@ -50,24 +51,25 @@
50 51
51struct ceph_mount_args { 52struct ceph_mount_args {
52 int sb_flags; 53 int sb_flags;
54 int flags;
55 struct ceph_fsid fsid;
56 struct ceph_entity_addr my_addr;
53 int num_mon; 57 int num_mon;
54 struct ceph_entity_addr *mon_addr; 58 struct ceph_entity_addr *mon_addr;
55 int flags;
56 int mount_timeout; 59 int mount_timeout;
57 int osd_idle_ttl; 60 int osd_idle_ttl;
58 int caps_wanted_delay_min, caps_wanted_delay_max;
59 struct ceph_fsid fsid;
60 struct ceph_entity_addr my_addr;
61 int wsize;
62 int rsize; /* max readahead */
63 int max_readdir; /* max readdir size */
64 int congestion_kb; /* max readdir size */
65 int osd_timeout; 61 int osd_timeout;
66 int osd_keepalive_timeout; 62 int osd_keepalive_timeout;
63 int wsize;
64 int rsize; /* max readahead */
65 int congestion_kb; /* max writeback in flight */
66 int caps_wanted_delay_min, caps_wanted_delay_max;
67 int cap_release_safety;
 68 int max_readdir; /* max readdir result (entries) */
69 int max_readdir_bytes; /* max readdir result (bytes) */
67 char *snapdir_name; /* default ".snap" */ 70 char *snapdir_name; /* default ".snap" */
68 char *name; 71 char *name;
69 char *secret; 72 char *secret;
70 int cap_release_safety;
71}; 73};
72 74
73/* 75/*
@@ -78,13 +80,14 @@ struct ceph_mount_args {
78#define CEPH_OSD_KEEPALIVE_DEFAULT 5 80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
79#define CEPH_OSD_IDLE_TTL_DEFAULT 60 81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
80#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83#define CEPH_MAX_READDIR_DEFAULT 1024
84#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
81 85
82#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 86#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
83#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
84 88
85#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 89#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
86#define CEPH_AUTH_NAME_DEFAULT "guest" 90#define CEPH_AUTH_NAME_DEFAULT "guest"
87
88/* 91/*
89 * Delay telling the MDS we no longer want caps, in case we reopen 92 * Delay telling the MDS we no longer want caps, in case we reopen
90 * the file. Delay a minimum amount of time, even if we send a cap 93 * the file. Delay a minimum amount of time, even if we send a cap
@@ -94,6 +97,7 @@ struct ceph_mount_args {
94#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 97#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
95#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
96 99
100#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
97 101
98/* mount state */ 102/* mount state */
99enum { 103enum {
@@ -158,12 +162,6 @@ struct ceph_client {
158#endif 162#endif
159}; 163};
160 164
161static inline struct ceph_client *ceph_client(struct super_block *sb)
162{
163 return sb->s_fs_info;
164}
165
166
167/* 165/*
168 * File i/o capability. This tracks shared state with the metadata 166 * File i/o capability. This tracks shared state with the metadata
169 * server that allows us to cache or writeback attributes or to read 167 * server that allows us to cache or writeback attributes or to read
@@ -714,8 +712,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 712extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
715 struct ceph_mds_session *session, 713 struct ceph_mds_session *session,
716 struct ceph_msg *msg); 714 struct ceph_msg *msg);
717extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, 715extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
718 struct ceph_snap_context *snapc);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 716extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap); 717 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 718extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
@@ -813,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
813 810
814extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
@@ -870,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
870extern void ceph_dentry_lru_add(struct dentry *dn); 867extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn); 868extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn); 869extern void ceph_dentry_lru_del(struct dentry *dn);
870extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
873 871
874/* 872/*
875 * our d_ops vary depending on whether the inode is live, 873 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 37d6ce645691..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -3,10 +3,12 @@
3#include "decode.h" 3#include "decode.h"
4 4
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include <linux/slab.h>
6 7
7static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
8{ 9{
9 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -75,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
75} 77}
76 78
77static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
78 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
79 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
80 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
81 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
82 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
83 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
84 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
85 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
86 { true, NULL, NULL } 88 { true, NULL, NULL }
87}; 89};
88 90
@@ -106,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
106} 108}
107 109
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
110 { NULL, NULL } 112 { NULL, NULL }
111}; 113};
112 114
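
With the "user.ceph.*" prefix dropped, the virtual xattrs live in their own "ceph." namespace, matching the new check in ceph_is_valid_xattr() above. A small userspace sketch of reading one; the mount path is a placeholder:

/* userspace sketch: read a renamed ceph virtual xattr */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/ceph/somedir",   /* placeholder path */
			     "ceph.dir.entries", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	buf[n] = '\0';
	printf("ceph.dir.entries = %s\n", buf);
	return 0;
}
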
@@ -185,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
185 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
187 } 189 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
191 xattr->val);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
196 if (val) 192 if (val)
@@ -573,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
573 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
574 570
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
577 goto list_xattr; 573 goto list_xattr;
578 } else { 574 } else {
579 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -621,7 +617,7 @@ out:
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
623{ 619{
624 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -640,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
640 return -ENOMEM; 636 return -ENOMEM;
641 err = -ENOMEM; 637 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
644 if (!pages[i]) { 640 if (!pages[i]) {
645 nr_pages = i; 641 nr_pages = i;
646 goto out; 642 goto out;
@@ -778,7 +774,7 @@ out:
778 774
779static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{ 776{
781 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index a20bea598933..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
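
Instead of collapsing the advertised mechanisms into a single secType out-parameter, the decoder now records each one as a flag on TCP_Server_Info and lets the caller choose later; the cifs_spnego.c hunk further down is one such consumer. A sketch of the pattern, with hypothetical helper names:

/* illustrative: negotiate from the recorded capability flags */
if (server->sec_kerberos || server->sec_mskerberos)
	rc = setup_krb5_session(server);    /* hypothetical helper */
else if (server->sec_ntlmssp)
	rc = setup_ntlmssp_session(server); /* hypothetical helper */
else
	rc = -EOPNOTSUPP;                   /* no usable mechanism */
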
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..4fce6e61b34e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 716
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 717static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 718{
719 seq_printf(m, "0x%x\n", extended_security); 719 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 720 return 0;
721} 721}
722 722
@@ -744,13 +744,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 744 /* single char or single char followed by null */
745 c = flags_string[0]; 745 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 746 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 747 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 748 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 749 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 750 global_secflags = CIFSSEC_MAX;
751 return count; 751 return count;
752 } else if (!isdigit(c)) { 752 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 753 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 754 return -EINVAL;
755 } 755 }
756 } 756 }
@@ -758,26 +758,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 758
759 flags = simple_strtoul(flags_string, NULL, 0); 759 flags = simple_strtoul(flags_string, NULL, 0);
760 760
761 cFYI(1, ("sec flags 0x%x", flags)); 761 cFYI(1, "sec flags 0x%x", flags);
762 762
763 if (flags <= 0) { 763 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 764 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (flags & ~CIFSSEC_MASK) { 768 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 769 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 770 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 /* flags look ok - update the global security flags for cifs module */ 773 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 774 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 775 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 776 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 777 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 778 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 779 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 780 cFYI(1, "packet signing disabled");
781 } 781 }
782 /* BB should we turn on MAY flags for other MUST options? */ 782 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 783 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
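
This rework explains the mechanical churn across the cifs files in this patch: the old macros took a single parenthesized printf argument pack, the new ones are ordinary variadic macros. Before and after at a typical call site:

/* old style: double parentheses, expanded via "cifsfyi prspec" */
cFYI(1, ("Error decoding negTokenInit header"));

/* new style: plain variadic macro call */
cFYI(1, "Error decoding negTokenInit header");
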
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b1d61d0bdfc7..ac19a6f3dae0 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -84,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
84 /* find server name end */ 85 /* find server name end */
85 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
86 if (!pSep) { 87 if (!pSep) {
87 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
88 __func__, node_name)); 89 __func__, node_name);
89 kfree(UNC); 90 kfree(UNC);
90 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
91 } 92 }
@@ -141,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 142
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 144 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc)); 146 __func__, *devname, rc);
146 goto compose_mount_options_err; 147 goto compose_mount_options_err;
147 } 148 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -216,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
216 strcat(mountdata, fullpath + ref->path_consumed); 217 strcat(mountdata, fullpath + ref->path_consumed);
217 } 218 }
218 219
219 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 220 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
220 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 221 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
221 222
222compose_mount_options_out: 223compose_mount_options_out:
223 kfree(srvIP); 224 kfree(srvIP);
@@ -293,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
293 294
294static void dump_referral(const struct dfs_info3_param *ref) 295static void dump_referral(const struct dfs_info3_param *ref)
295{ 296{
296 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 297 cFYI(1, "DFS: ref path: %s", ref->path_name);
297 cFYI(1, ("DFS: node path: %s", ref->node_name)); 298 cFYI(1, "DFS: node path: %s", ref->node_name);
298 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 299 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
299 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 300 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
300 ref->path_consumed)); 301 ref->path_consumed);
301} 302}
302 303
303 304
@@ -313,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
313 int rc = 0; 314 int rc = 0;
314 struct vfsmount *mnt = ERR_PTR(-ENOENT); 315 struct vfsmount *mnt = ERR_PTR(-ENOENT);
315 316
316 cFYI(1, ("in %s", __func__)); 317 cFYI(1, "in %s", __func__);
317 BUG_ON(IS_ROOT(dentry)); 318 BUG_ON(IS_ROOT(dentry));
318 319
319 xid = GetXid(); 320 xid = GetXid();
@@ -351,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
351 /* connect to a node */ 352 /* connect to a node */
352 len = strlen(referrals[i].node_name); 353 len = strlen(referrals[i].node_name);
353 if (len < 2) { 354 if (len < 2) {
354 cERROR(1, ("%s: Net Address path too short: %s", 355 cERROR(1, "%s: Net Address path too short: %s",
355 __func__, referrals[i].node_name)); 356 __func__, referrals[i].node_name);
356 rc = -EINVAL; 357 rc = -EINVAL;
357 goto out_err; 358 goto out_err;
358 } 359 }
359 mnt = cifs_dfs_do_refmount(nd->path.mnt, 360 mnt = cifs_dfs_do_refmount(nd->path.mnt,
360 nd->path.dentry, referrals + i); 361 nd->path.dentry, referrals + i);
361 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 362 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
362 referrals[i].node_name, mnt)); 363 referrals[i].node_name, mnt);
363 364
 364 /* complete mount procedure if we acquired submount */ 365 /* complete mount procedure if we acquired submount */
365 if (!IS_ERR(mnt)) 366 if (!IS_ERR(mnt))
@@ -377,7 +378,7 @@ out:
377 FreeXid(xid); 378 FreeXid(xid);
378 free_dfs_info_array(referrals, num_referrals); 379 free_dfs_info_array(referrals, num_referrals);
379 kfree(full_path); 380 kfree(full_path);
380 cFYI(1, ("leaving %s" , __func__)); 381 cFYI(1, "leaving %s" , __func__);
381 return ERR_PTR(rc); 382 return ERR_PTR(rc);
382out_err: 383out_err:
383 path_put(&nd->path); 384 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..379bd7d9c05f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
@@ -132,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
132 dp = description + strlen(description); 133 dp = description + strlen(description);
133 134
134 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 135 /* for now, only sec=krb5 and sec=mskrb5 are valid */
135 if (server->secType == Kerberos) 136 if (server->sec_kerberos)
136 sprintf(dp, ";sec=krb5"); 137 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos) 138 else if (server->sec_mskerberos)
138 sprintf(dp, ";sec=mskrb5"); 139 sprintf(dp, ";sec=mskrb5");
139 else 140 else
140 goto out; 141 goto out;
@@ -148,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
148 dp = description + strlen(description); 149 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid); 150 sprintf(dp, ";pid=0x%x", current->pid);
150 151
151 cFYI(1, ("key description = %s", description)); 152 cFYI(1, "key description = %s", description);
152 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 153 spnego_key = request_key(&cifs_spnego_key_type, description, "");
153 154
154#ifdef CONFIG_CIFS_DEBUG2 155#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
@@ -199,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
199 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
200 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
201 if (charlen < 1) { 202 if (charlen < 1) {
202 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
203 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
204 (int)*from, charlen));
205 /* A question mark */ 205 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
207 charlen = 1; 207 charlen = 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
@@ -86,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
86 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
87 } 88 }
88 89
89 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
90 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
91 } 92 }
92 93
93 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
94 return -1; 95 return -1;
95} 96}
96 97
@@ -207,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
207 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
208 return; 209 return;
209 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
210 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
211 return; 212 return;
212 } 213 }
213 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
214 215
215 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
216 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
217 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
218 return; 219 return;
219 } 220 }
220 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -227,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
227 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
228 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
229 230
230 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
231 return; 232 return;
232} 233}
233 234
@@ -256,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
256 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
257 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
258 259
259 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
260 return; 261 return;
261} 262}
262 263
@@ -296,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
296 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
297 298
298 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
299 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
300 return; 301 return;
301 } 302 }
302 303
303 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
304 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
305 return; 306 return;
306 } 307 }
307 308
308 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
309 if (num_subauth) { 310 if (num_subauth) {
310 int i; 311 int i;
311 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
312 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
313 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
314 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
315 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
316 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
317 } 318 }
318 319
319 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -346,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
346 347
347 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
348 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
349 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
350 return; 351 return;
351 } 352 }
352 353
353 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
354 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
355 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
356 357
357 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -436,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
436 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
437 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
438 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
439 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
440 return -EINVAL; 441 return -EINVAL;
441 } 442 }
442 443
443 if (psid->num_subauth) { 444 if (psid->num_subauth) {
444#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
445 int i; 446 int i;
446 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
447 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
448 449
449 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
450 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
451 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
452 } 453 }
453 454
454 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
455 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
456 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
457 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
458#endif 459#endif
459 } 460 }
460 461
@@ -481,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
481 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
482 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
483 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
484 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
485 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
486 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
487 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
488 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
489/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
490 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
491 if (rc) 492 if (rc)
@@ -499,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
501 else 502 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
503 504
504/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
505 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -562,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
562 FreeXid(xid); 563 FreeXid(xid);
563 564
564 565
565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
566 return pntsd; 567 return pntsd;
567} 568}
568 569
@@ -580,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
580 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
582 if (rc) { 583 if (rc) {
583 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
584 goto out; 585 goto out;
585 } 586 }
586 587
587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
589 590
590 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out: 592 out:
@@ -620,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid); 622 FreeXid(xid);
622 623
623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
624 return rc; 625 return rc;
625} 626}
626 627
@@ -637,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
637 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) { 640 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
641 goto out; 642 goto out;
642 } 643 }
643 644
644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
646 647
647 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out: 649 out:
@@ -658,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
658 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
659 int rc; 660 int rc;
660 661
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
662 663
663 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file) 665 if (!open_file)
@@ -678,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
678 u32 acllen = 0; 679 u32 acllen = 0;
679 int rc = 0; 680 int rc = 0;
680 681
681 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
682 683
683 if (pfid) 684 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -689,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
689 if (pntsd) 690 if (pntsd)
690 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
691 if (rc) 692 if (rc)
692 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
693 694
694 kfree(pntsd); 695 kfree(pntsd);
695 return; 696 return;
@@ -703,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
703 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
704 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
705 706
706 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
707 708
708 /* Get the security descriptor */ 709 /* Get the security descriptor */
709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -720,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
720 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
721 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
722 if (!pnntsd) { 723 if (!pnntsd) {
723 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
724 kfree(pntsd); 725 kfree(pntsd);
725 return -ENOMEM; 726 return -ENOMEM;
726 } 727 }
727 728
728 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
729 730
730 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
731 732
732 if (!rc) { 733 if (!rc) {
733 /* Set the security descriptor */ 734 /* Set the security descriptor */
734 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
735 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
736 } 737 }
737 738
738 kfree(pnntsd); 739 kfree(pnntsd);
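
The change running through the cifsacl.c hunks above is mechanical: the extra parentheses around the cFYI()/cERROR() argument lists are dropped, which only works once the macros themselves accept a variable argument list. A minimal sketch of what such a variadic debug macro can look like (the gating and printk prefix here are illustrative; the real definitions live in fs/cifs/cifs_debug.h and may differ):

/* Illustrative variadic debug macro; cifsFYI is the existing global switch. */
#define cFYI(set, fmt, ...)						\
do {									\
	if ((set) && cifsFYI)						\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

With a variadic macro in place, the old double-parenthesized form cFYI(1, ("x %d", v)) would pass the comma expression ("x %d", v), which evaluates to just v, as the format argument, so the conversion in these hunks is required for the code to keep compiling, not merely cosmetic.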
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..847628dfdc44 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
@@ -102,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
102 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
103 continue; 104 continue;
104 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
105 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
106 return -EIO; 107 return -EIO;
107 } 108 }
108 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -180,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
180 181
181 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
182 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
183 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
184 cifs_pdu->Command)); 185 cifs_pdu->Command);
185 186
186 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
187 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -290,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
290 if (password) 291 if (password)
291 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
292 293
293 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 294 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
294 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
295 memcpy(lnm_session_key, password_with_pad, 296 memcpy(lnm_session_key, password_with_pad,
296 CIFS_ENCPWD_SIZE); 297 CIFS_ENCPWD_SIZE);
@@ -397,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
397 /* calculate buf->ntlmv2_hash */ 398 /* calculate buf->ntlmv2_hash */
398 rc = calc_ntlmv2_hash(ses, nls_cp); 399 rc = calc_ntlmv2_hash(ses, nls_cp);
399 if (rc) 400 if (rc)
400 cERROR(1, ("could not get v2 hash rc %d", rc)); 401 cERROR(1, "could not get v2 hash rc %d", rc);
401 CalcNTLMv2_response(ses, resp_buf); 402 CalcNTLMv2_response(ses, resp_buf);
402 403
403 /* now calculate the MAC key for NTLMv2 */ 404 /* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5183bc2a1916..78c02eb4cb1f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
49#include "cifs_spnego.h" 49#include "cifs_spnego.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -103,6 +97,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 97 if (cifs_sb == NULL)
104 return -ENOMEM; 98 return -ENOMEM;
105 99
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) {
102 kfree(cifs_sb);
103 return rc;
104 }
105
106#ifdef CONFIG_CIFS_DFS_UPCALL 106#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 107 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 108 /* BB: should we move this after the mount so we
@@ -115,6 +115,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 115 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 117 if (cifs_sb->mountdata == NULL) {
118 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 119 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 120 sb->s_fs_info = NULL;
120 return -ENOMEM; 121 return -ENOMEM;
@@ -128,19 +129,16 @@ cifs_read_super(struct super_block *sb, void *data,
128 129
129 if (rc) { 130 if (rc) {
130 if (!silent) 131 if (!silent)
131 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
132 ("cifs_mount failed w/return code = %d", rc));
133 goto out_mount_failed; 133 goto out_mount_failed;
134 } 134 }
135 135
136 sb->s_magic = CIFS_MAGIC_NUMBER; 136 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 137 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 140 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
141#ifdef CONFIG_CIFS_QUOTA
142 sb->s_qcop = &cifs_quotactl_ops;
143#endif
144 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
145 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
146 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -160,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
160 158
161#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
162 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
163 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
164 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
165 } 163 }
166#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -168,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 return 0; 166 return 0;
169 167
170out_no_root: 168out_no_root:
171 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
172 if (inode) 170 if (inode)
173 iput(inode); 171 iput(inode);
174 172
@@ -183,6 +181,7 @@ out_mount_failed:
183 } 181 }
184#endif 182#endif
185 unload_nls(cifs_sb->local_nls); 183 unload_nls(cifs_sb->local_nls);
184 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 185 kfree(cifs_sb);
187 } 186 }
188 return rc; 187 return rc;
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
194 int rc = 0; 193 int rc = 0;
195 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
196 195
197 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
198 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
199 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
200 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
201 return; 200 return;
202 } 201 }
203 202
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
205 204
206 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
207 if (rc) 206 if (rc)
208 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
209#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
210 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
211 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -214,6 +213,7 @@ cifs_put_super(struct super_block *sb)
214#endif 213#endif
215 214
216 unload_nls(cifs_sb->local_nls); 215 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 217 kfree(cifs_sb);
218 218
219 unlock_kernel(); 219 unlock_kernel();
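
The bdi_setup_and_register()/bdi_destroy() calls threaded through cifs_read_super() and cifs_put_super() above give each cifs superblock its own backing_dev_info, published through sb->s_bdi so the writeback code can find it. The invariant to preserve is that every exit path taken after a successful setup also destroys the bdi, which is why both the out_mount_failed path and cifs_put_super() gain bdi_destroy() calls. A condensed sketch of the same pattern in a generic fill_super (the example_* names and the example_connect() helper are stand-ins, not cifs code):

#include <linux/fs.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>

struct example_sb_info {
	struct backing_dev_info bdi;	/* embedded per-mount bdi */
	/* ... fs-private fields ... */
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	struct example_sb_info *sbi;
	int rc;

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	/* register a per-mount bdi; "example" names it in sysfs */
	rc = bdi_setup_and_register(&sbi->bdi, "example", BDI_CAP_MAP_COPY);
	if (rc) {
		kfree(sbi);
		return rc;
	}
	sb->s_fs_info = sbi;
	sb->s_bdi = &sbi->bdi;		/* writeback now targets this bdi */

	rc = example_connect(sb, data, silent);	/* assumed mount helper */
	if (rc) {
		/* any failure after setup must unwind the bdi too */
		bdi_destroy(&sbi->bdi);
		kfree(sbi);
		sb->s_fs_info = NULL;
	}
	return rc;
}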
@@ -290,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
290static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
291static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
292static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
293struct kmem_cache *cifs_oplock_cachep;
294static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
295mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
296mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -422,106 +421,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 return 0; 421 return 0;
423} 422}
424 423
425#ifdef CONFIG_CIFS_QUOTA
426int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
427 struct fs_disk_quota *pdquota)
428{
429 int xid;
430 int rc = 0;
431 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
432 struct cifsTconInfo *pTcon;
433
434 if (cifs_sb)
435 pTcon = cifs_sb->tcon;
436 else
437 return -EIO;
438
439
440 xid = GetXid();
441 if (pTcon) {
442 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
443 } else
444 rc = -EIO;
445
446 FreeXid(xid);
447 return rc;
448}
449
450int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
451 struct fs_disk_quota *pdquota)
452{
453 int xid;
454 int rc = 0;
455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
456 struct cifsTconInfo *pTcon;
457
458 if (cifs_sb)
459 pTcon = cifs_sb->tcon;
460 else
461 return -EIO;
462
463 xid = GetXid();
464 if (pTcon) {
465 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
466 } else
467 rc = -EIO;
468
469 FreeXid(xid);
470 return rc;
471}
472
473int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
474{
475 int xid;
476 int rc = 0;
477 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
478 struct cifsTconInfo *pTcon;
479
480 if (cifs_sb)
481 pTcon = cifs_sb->tcon;
482 else
483 return -EIO;
484
485 xid = GetXid();
486 if (pTcon) {
487 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
488 } else
489 rc = -EIO;
490
491 FreeXid(xid);
492 return rc;
493}
494
495int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
496{
497 int xid;
498 int rc = 0;
499 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
500 struct cifsTconInfo *pTcon;
501
502 if (cifs_sb)
503 pTcon = cifs_sb->tcon;
504 else
505 return -EIO;
506
507 xid = GetXid();
508 if (pTcon) {
509 cFYI(1, ("pqstats %p", qstats));
510 } else
511 rc = -EIO;
512
513 FreeXid(xid);
514 return rc;
515}
516
517static const struct quotactl_ops cifs_quotactl_ops = {
518 .set_xquota = cifs_xquota_set,
519 .get_xquota = cifs_xquota_get,
520 .set_xstate = cifs_xstate_set,
521 .get_xstate = cifs_xstate_get,
522};
523#endif
524
525static void cifs_umount_begin(struct super_block *sb) 424static void cifs_umount_begin(struct super_block *sb)
526{ 425{
527 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 426 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -548,7 +447,7 @@ static void cifs_umount_begin(struct super_block *sb)
548 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 447 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
549 /* cancel_notify_requests(tcon); */ 448 /* cancel_notify_requests(tcon); */
550 if (tcon->ses && tcon->ses->server) { 449 if (tcon->ses && tcon->ses->server) {
551 cFYI(1, ("wake up tasks now - umount begin not complete")); 450 cFYI(1, "wake up tasks now - umount begin not complete");
552 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
553 wake_up_all(&tcon->ses->server->response_q); 452 wake_up_all(&tcon->ses->server->response_q);
554 msleep(1); /* yield */ 453 msleep(1); /* yield */
@@ -599,7 +498,7 @@ cifs_get_sb(struct file_system_type *fs_type,
599 int rc; 498 int rc;
600 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 499 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
601 500
602 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 501 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
603 502
604 if (IS_ERR(sb)) 503 if (IS_ERR(sb))
605 return PTR_ERR(sb); 504 return PTR_ERR(sb);
@@ -646,7 +545,6 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
646 return generic_file_llseek_unlocked(file, offset, origin); 545 return generic_file_llseek_unlocked(file, offset, origin);
647} 546}
648 547
649#ifdef CONFIG_CIFS_EXPERIMENTAL
650static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 548static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
651{ 549{
652 /* note that this is called by vfs setlease with the BKL held 550 /* note that this is called by vfs setlease with the BKL held
@@ -675,7 +573,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
675 else 573 else
676 return -EAGAIN; 574 return -EAGAIN;
677} 575}
678#endif
679 576
680struct file_system_type cifs_fs_type = { 577struct file_system_type cifs_fs_type = {
681 .owner = THIS_MODULE, 578 .owner = THIS_MODULE,
@@ -752,10 +649,7 @@ const struct file_operations cifs_file_ops = {
752#ifdef CONFIG_CIFS_POSIX 649#ifdef CONFIG_CIFS_POSIX
753 .unlocked_ioctl = cifs_ioctl, 650 .unlocked_ioctl = cifs_ioctl,
754#endif /* CONFIG_CIFS_POSIX */ 651#endif /* CONFIG_CIFS_POSIX */
755
756#ifdef CONFIG_CIFS_EXPERIMENTAL
757 .setlease = cifs_setlease, 652 .setlease = cifs_setlease,
758#endif /* CONFIG_CIFS_EXPERIMENTAL */
759}; 653};
760 654
761const struct file_operations cifs_file_direct_ops = { 655const struct file_operations cifs_file_direct_ops = {
@@ -774,9 +668,7 @@ const struct file_operations cifs_file_direct_ops = {
774 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
775#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
776 .llseek = cifs_llseek, 670 .llseek = cifs_llseek,
777#ifdef CONFIG_CIFS_EXPERIMENTAL
778 .setlease = cifs_setlease, 671 .setlease = cifs_setlease,
779#endif /* CONFIG_CIFS_EXPERIMENTAL */
780}; 672};
781const struct file_operations cifs_file_nobrl_ops = { 673const struct file_operations cifs_file_nobrl_ops = {
782 .read = do_sync_read, 674 .read = do_sync_read,
@@ -793,10 +685,7 @@ const struct file_operations cifs_file_nobrl_ops = {
793#ifdef CONFIG_CIFS_POSIX 685#ifdef CONFIG_CIFS_POSIX
794 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
795#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
796
797#ifdef CONFIG_CIFS_EXPERIMENTAL
798 .setlease = cifs_setlease, 688 .setlease = cifs_setlease,
799#endif /* CONFIG_CIFS_EXPERIMENTAL */
800}; 689};
801 690
802const struct file_operations cifs_file_direct_nobrl_ops = { 691const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -808,14 +697,13 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
808 .release = cifs_close, 697 .release = cifs_close,
809 .fsync = cifs_fsync, 698 .fsync = cifs_fsync,
810 .flush = cifs_flush, 699 .flush = cifs_flush,
700 .mmap = cifs_file_mmap,
811 .splice_read = generic_file_splice_read, 701 .splice_read = generic_file_splice_read,
812#ifdef CONFIG_CIFS_POSIX 702#ifdef CONFIG_CIFS_POSIX
813 .unlocked_ioctl = cifs_ioctl, 703 .unlocked_ioctl = cifs_ioctl,
814#endif /* CONFIG_CIFS_POSIX */ 704#endif /* CONFIG_CIFS_POSIX */
815 .llseek = cifs_llseek, 705 .llseek = cifs_llseek,
816#ifdef CONFIG_CIFS_EXPERIMENTAL
817 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
818#endif /* CONFIG_CIFS_EXPERIMENTAL */
819}; 707};
820 708
821const struct file_operations cifs_dir_ops = { 709const struct file_operations cifs_dir_ops = {
@@ -867,7 +755,7 @@ cifs_init_request_bufs(void)
867 } else { 755 } else {
868 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 756 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
869 } 757 }
870/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 758/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
871 cifs_req_cachep = kmem_cache_create("cifs_request", 759 cifs_req_cachep = kmem_cache_create("cifs_request",
872 CIFSMaxBufSize + 760 CIFSMaxBufSize +
873 MAX_CIFS_HDR_SIZE, 0, 761 MAX_CIFS_HDR_SIZE, 0,
@@ -879,7 +767,7 @@ cifs_init_request_bufs(void)
879 cifs_min_rcv = 1; 767 cifs_min_rcv = 1;
880 else if (cifs_min_rcv > 64) { 768 else if (cifs_min_rcv > 64) {
881 cifs_min_rcv = 64; 769 cifs_min_rcv = 64;
882 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 770 cERROR(1, "cifs_min_rcv set to maximum (64)");
883 } 771 }
884 772
885 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 773 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -910,7 +798,7 @@ cifs_init_request_bufs(void)
910 cifs_min_small = 2; 798 cifs_min_small = 2;
911 else if (cifs_min_small > 256) { 799 else if (cifs_min_small > 256) {
912 cifs_min_small = 256; 800 cifs_min_small = 256;
913 cFYI(1, ("cifs_min_small set to maximum (256)")); 801 cFYI(1, "cifs_min_small set to maximum (256)");
914 } 802 }
915 803
916 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 804 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -951,15 +839,6 @@ cifs_init_mids(void)
951 return -ENOMEM; 839 return -ENOMEM;
952 } 840 }
953 841
954 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
955 sizeof(struct oplock_q_entry), 0,
956 SLAB_HWCACHE_ALIGN, NULL);
957 if (cifs_oplock_cachep == NULL) {
958 mempool_destroy(cifs_mid_poolp);
959 kmem_cache_destroy(cifs_mid_cachep);
960 return -ENOMEM;
961 }
962
963 return 0; 842 return 0;
964} 843}
965 844
@@ -968,7 +847,6 @@ cifs_destroy_mids(void)
968{ 847{
969 mempool_destroy(cifs_mid_poolp); 848 mempool_destroy(cifs_mid_poolp);
970 kmem_cache_destroy(cifs_mid_cachep); 849 kmem_cache_destroy(cifs_mid_cachep);
971 kmem_cache_destroy(cifs_oplock_cachep);
972} 850}
973 851
974static int __init 852static int __init
@@ -1008,10 +886,10 @@ init_cifs(void)
1008 886
1009 if (cifs_max_pending < 2) { 887 if (cifs_max_pending < 2) {
1010 cifs_max_pending = 2; 888 cifs_max_pending = 2;
1011 cFYI(1, ("cifs_max_pending set to min of 2")); 889 cFYI(1, "cifs_max_pending set to min of 2");
1012 } else if (cifs_max_pending > 256) { 890 } else if (cifs_max_pending > 256) {
1013 cifs_max_pending = 256; 891 cifs_max_pending = 256;
1014 cFYI(1, ("cifs_max_pending set to max of 256")); 892 cFYI(1, "cifs_max_pending set to max of 256");
1015 } 893 }
1016 894
1017 rc = cifs_init_inodecache(); 895 rc = cifs_init_inodecache();
@@ -1069,7 +947,7 @@ init_cifs(void)
1069static void __exit 947static void __exit
1070exit_cifs(void) 948exit_cifs(void)
1071{ 949{
1072 cFYI(DBG2, ("exit_cifs")); 950 cFYI(DBG2, "exit_cifs");
1073 cifs_proc_clean(); 951 cifs_proc_clean();
1074#ifdef CONFIG_CIFS_DFS_UPCALL 952#ifdef CONFIG_CIFS_DFS_UPCALL
1075 cifs_dfs_release_automount_timer(); 953 cifs_dfs_release_automount_timer();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7aa57ecdc437..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
116 116
117#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.64"
118#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 63c89d1d70b5..a88479ceaad5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -86,7 +87,6 @@ enum securityEnum {
86 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 87 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
87/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ 88/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
88 Kerberos, /* Kerberos via SPNEGO */ 89 Kerberos, /* Kerberos via SPNEGO */
89 MSKerberos, /* MS Kerberos via SPNEGO */
90}; 90};
91 91
92enum protocolEnum { 92enum protocolEnum {
@@ -184,6 +184,12 @@ struct TCP_Server_Info {
184 struct mac_key mac_signing_key; 184 struct mac_key mac_signing_key;
185 char ntlmv2_hash[16]; 185 char ntlmv2_hash[16];
186 unsigned long lstrp; /* when we got last response from this server */ 186 unsigned long lstrp; /* when we got last response from this server */
187 u16 dialect; /* dialect index that server chose */
188 /* extended security flavors that server supports */
189 bool sec_kerberos; /* supports plain Kerberos */
190 bool sec_mskerberos; /* supports legacy MS Kerberos */
191 bool sec_kerberosu2u; /* supports U2U Kerberos */
192 bool sec_ntlmssp; /* supports NTLMSSP */
187}; 193};
188 194
189/* 195/*
@@ -501,6 +507,7 @@ struct dfs_info3_param {
501#define CIFS_FATTR_DFS_REFERRAL 0x1 507#define CIFS_FATTR_DFS_REFERRAL 0x1
502#define CIFS_FATTR_DELETE_PENDING 0x2 508#define CIFS_FATTR_DELETE_PENDING 0x2
503#define CIFS_FATTR_NEED_REVAL 0x4 509#define CIFS_FATTR_NEED_REVAL 0x4
510#define CIFS_FATTR_INO_COLLISION 0x8
504 511
505struct cifs_fattr { 512struct cifs_fattr {
506 u32 cf_flags; 513 u32 cf_flags;
@@ -716,7 +723,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
716GLOBAL_EXTERN unsigned int oplockEnabled; 723GLOBAL_EXTERN unsigned int oplockEnabled;
717GLOBAL_EXTERN unsigned int experimEnabled; 724GLOBAL_EXTERN unsigned int experimEnabled;
718GLOBAL_EXTERN unsigned int lookupCacheEnabled; 725GLOBAL_EXTERN unsigned int lookupCacheEnabled;
719GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent 726GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
720 with more secure ntlmssp2 challenge/resp */ 727 with more secure ntlmssp2 challenge/resp */
721GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 728GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
722GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 729GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 39e47f46dea5..fb1657e0fdb8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() \
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43({ \
44 int __xid = (int)_GetXid(); \
45 cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \
46 __func__, __xid, current_fsuid()); \
47 __xid; \
48})
49
50#define FreeXid(curr_xid) \
51do { \
52 _FreeXid(curr_xid); \
53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
54 __func__, curr_xid, (int)rc); \
55} while (0)
44extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
46extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 58extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
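
The GetXid() rewrite above fixes a macro that expanded to two statements: with the old definition, int xid = GetXid(); ended the declaration at the first semicolon and left the cFYI() call as a separate trailing statement that quietly referenced a variable named xid at the call site, and the macro could not be used inside an if () at all. The new definition is a GNU statement expression, which executes a block and yields its last expression as the macro's value, while FreeXid() gets the standard do { } while (0) wrapper. A small standalone illustration of the statement-expression idiom (the _get_id() helper is hypothetical; compile with gcc, since ({ }) is a GNU extension):

#include <stdio.h>

static unsigned int _get_id(void)	/* hypothetical stand-in for _GetXid() */
{
	static unsigned int next;
	return ++next;
}

/* Runs its statements, then yields __id as the expression's value. */
#define get_id()						\
({								\
	int __id = (int)_get_id();				\
	printf("in %s as id %d\n", __func__, __id);		\
	__id;							\
})

int main(void)
{
	int id = get_id();	/* safe as an initializer, in if (), etc. */
	printf("got %d\n", id);
	return 0;
}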
@@ -73,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
73extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 88 struct TCP_Server_Info *server);
77extern int cifs_convert_address(char *src, void *dst); 89extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 90extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 91extern void header_assemble(struct smb_hdr *, char /* command */ ,
@@ -83,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
83 struct cifsSesInfo *ses, 95 struct cifsSesInfo *ses,
84 void **request_buf); 96 void **request_buf);
85extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 97extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage,
87 const struct nls_table *nls_cp); 98 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 99extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 100extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +106,11 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file, 106 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags); 107 struct vfsmount *mnt, unsigned int oflags);
97extern int cifs_posix_open(char *full_path, struct inode **pinode, 108extern int cifs_posix_open(char *full_path, struct inode **pinode,
98 struct vfsmount *mnt, int mode, int oflags, 109 struct vfsmount *mnt,
99 __u32 *poplock, __u16 *pnetfid, int xid); 110 struct super_block *sb,
111 int mode, int oflags,
112 __u32 *poplock, __u16 *pnetfid, int xid);
113void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 114extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
101 FILE_UNIX_BASIC_INFO *info, 115 FILE_UNIX_BASIC_INFO *info,
102 struct cifs_sb_info *cifs_sb); 116 struct cifs_sb_info *cifs_sb);
@@ -125,7 +139,9 @@ extern void cifs_dfs_release_automount_timer(void);
125void cifs_proc_init(void); 139void cifs_proc_init(void);
126void cifs_proc_clean(void); 140void cifs_proc_clean(void);
127 141
128extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 142extern int cifs_negotiate_protocol(unsigned int xid,
143 struct cifsSesInfo *ses);
144extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
129 struct nls_table *nls_info); 145 struct nls_table *nls_info);
130extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); 146extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
131 147
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7cc7f83e9314..c65c3419dd37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2009 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -129,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
129 if (smb_command != SMB_COM_WRITE_ANDX && 130 if (smb_command != SMB_COM_WRITE_ANDX &&
130 smb_command != SMB_COM_OPEN_ANDX && 131 smb_command != SMB_COM_OPEN_ANDX &&
131 smb_command != SMB_COM_TREE_DISCONNECT) { 132 smb_command != SMB_COM_TREE_DISCONNECT) {
132 cFYI(1, ("can not send cmd %d while umounting", 133 cFYI(1, "can not send cmd %d while umounting",
133 smb_command)); 134 smb_command);
134 return -ENODEV; 135 return -ENODEV;
135 } 136 }
136 } 137 }
@@ -156,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * back on-line 157 * back on-line
157 */ 158 */
158 if (!tcon->retry || ses->status == CifsExiting) { 159 if (!tcon->retry || ses->status == CifsExiting) {
159 cFYI(1, ("gave up waiting on reconnect in smb_init")); 160 cFYI(1, "gave up waiting on reconnect in smb_init");
160 return -EHOSTDOWN; 161 return -EHOSTDOWN;
161 } 162 }
162 } 163 }
@@ -171,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 mutex_lock(&ses->session_mutex); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 rc = cifs_negotiate_protocol(0, ses);
176 if (rc == 0 && ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 177 rc = cifs_setup_session(0, ses, nls_codepage);
176 178
177 /* do we need to reconnect tcon? */ 179 /* do we need to reconnect tcon? */
@@ -183,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
183 mark_open_files_invalid(tcon); 185 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 186 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 mutex_unlock(&ses->session_mutex); 187 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 188 cFYI(1, "reconnect tcon rc = %d", rc);
187 189
188 if (rc) 190 if (rc)
189 goto out; 191 goto out;
@@ -354,7 +356,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
354 struct TCP_Server_Info *server; 356 struct TCP_Server_Info *server;
355 u16 count; 357 u16 count;
356 unsigned int secFlags; 358 unsigned int secFlags;
357 u16 dialect;
358 359
359 if (ses->server) 360 if (ses->server)
360 server = ses->server; 361 server = ses->server;
@@ -371,9 +372,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
371 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) 372 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
372 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ 373 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
373 else /* if override flags set only sign/seal OR them with global auth */ 374 else /* if override flags set only sign/seal OR them with global auth */
374 secFlags = extended_security | ses->overrideSecFlg; 375 secFlags = global_secflags | ses->overrideSecFlg;
375 376
376 cFYI(1, ("secFlags 0x%x", secFlags)); 377 cFYI(1, "secFlags 0x%x", secFlags);
377 378
378 pSMB->hdr.Mid = GetNextMid(server); 379 pSMB->hdr.Mid = GetNextMid(server);
379 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 380 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -381,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
381 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 382 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
382 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 383 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
383 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 384 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
384 cFYI(1, ("Kerberos only mechanism, enable extended security")); 385 cFYI(1, "Kerberos only mechanism, enable extended security");
385 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 386 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
386 } 387 }
387#ifdef CONFIG_CIFS_EXPERIMENTAL 388#ifdef CONFIG_CIFS_EXPERIMENTAL
388 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) 389 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
389 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 390 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
390 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 391 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
391 cFYI(1, ("NTLMSSP only mechanism, enable extended security")); 392 cFYI(1, "NTLMSSP only mechanism, enable extended security");
392 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 393 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
393 } 394 }
394#endif 395#endif
@@ -407,10 +408,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
407 if (rc != 0) 408 if (rc != 0)
408 goto neg_err_exit; 409 goto neg_err_exit;
409 410
410 dialect = le16_to_cpu(pSMBr->DialectIndex); 411 server->dialect = le16_to_cpu(pSMBr->DialectIndex);
411 cFYI(1, ("Dialect: %d", dialect)); 412 cFYI(1, "Dialect: %d", server->dialect);
412 /* Check wct = 1 error case */ 413 /* Check wct = 1 error case */
413 if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) { 414 if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
414 /* core returns wct = 1, but we do not ask for core - otherwise 415 /* core returns wct = 1, but we do not ask for core - otherwise
415 small wct just comes when dialect index is -1 indicating we 416 small wct just comes when dialect index is -1 indicating we
416 could not negotiate a common dialect */ 417 could not negotiate a common dialect */
@@ -418,8 +419,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
418 goto neg_err_exit; 419 goto neg_err_exit;
419#ifdef CONFIG_CIFS_WEAK_PW_HASH 420#ifdef CONFIG_CIFS_WEAK_PW_HASH
420 } else if ((pSMBr->hdr.WordCount == 13) 421 } else if ((pSMBr->hdr.WordCount == 13)
421 && ((dialect == LANMAN_PROT) 422 && ((server->dialect == LANMAN_PROT)
422 || (dialect == LANMAN2_PROT))) { 423 || (server->dialect == LANMAN2_PROT))) {
423 __s16 tmp; 424 __s16 tmp;
424 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; 425 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
425 426
@@ -427,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
427 (secFlags & CIFSSEC_MAY_PLNTXT)) 428 (secFlags & CIFSSEC_MAY_PLNTXT))
428 server->secType = LANMAN; 429 server->secType = LANMAN;
429 else { 430 else {
430 cERROR(1, ("mount failed weak security disabled" 431 cERROR(1, "mount failed weak security disabled"
431 " in /proc/fs/cifs/SecurityFlags")); 432 " in /proc/fs/cifs/SecurityFlags");
432 rc = -EOPNOTSUPP; 433 rc = -EOPNOTSUPP;
433 goto neg_err_exit; 434 goto neg_err_exit;
434 } 435 }
@@ -461,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
461 utc = CURRENT_TIME; 462 utc = CURRENT_TIME;
462 ts = cnvrtDosUnixTm(rsp->SrvTime.Date, 463 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
463 rsp->SrvTime.Time, 0); 464 rsp->SrvTime.Time, 0);
464 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", 465 cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
465 (int)ts.tv_sec, (int)utc.tv_sec, 466 (int)ts.tv_sec, (int)utc.tv_sec,
466 (int)(utc.tv_sec - ts.tv_sec))); 467 (int)(utc.tv_sec - ts.tv_sec));
467 val = (int)(utc.tv_sec - ts.tv_sec); 468 val = (int)(utc.tv_sec - ts.tv_sec);
468 seconds = abs(val); 469 seconds = abs(val);
469 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; 470 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -477,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
477 server->timeAdj = (int)tmp; 478 server->timeAdj = (int)tmp;
478 server->timeAdj *= 60; /* also in seconds */ 479 server->timeAdj *= 60; /* also in seconds */
479 } 480 }
480 cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj)); 481 cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
481 482
482 483
483 /* BB get server time for time conversions and add 484 /* BB get server time for time conversions and add
@@ -492,14 +493,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
492 goto neg_err_exit; 493 goto neg_err_exit;
493 } 494 }
494 495
495 cFYI(1, ("LANMAN negotiated")); 496 cFYI(1, "LANMAN negotiated");
496 /* we will not end up setting signing flags - as no signing 497 /* we will not end up setting signing flags - as no signing
497 was in LANMAN and server did not return the flags on */ 498 was in LANMAN and server did not return the flags on */
498 goto signing_check; 499 goto signing_check;
499#else /* weak security disabled */ 500#else /* weak security disabled */
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, "mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support");
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
@@ -511,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
511 /* else wct == 17 NTLM */ 512 /* else wct == 17 NTLM */
512 server->secMode = pSMBr->SecurityMode; 513 server->secMode = pSMBr->SecurityMode;
513 if ((server->secMode & SECMODE_USER) == 0) 514 if ((server->secMode & SECMODE_USER) == 0)
514 cFYI(1, ("share mode security")); 515 cFYI(1, "share mode security");
515 516
516 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0) 517 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
517#ifdef CONFIG_CIFS_WEAK_PW_HASH 518#ifdef CONFIG_CIFS_WEAK_PW_HASH
518 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) 519 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
519#endif /* CIFS_WEAK_PW_HASH */ 520#endif /* CIFS_WEAK_PW_HASH */
520 cERROR(1, ("Server requests plain text password" 521 cERROR(1, "Server requests plain text password"
521 " but client support disabled")); 522 " but client support disabled");
522 523
523 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) 524 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
524 server->secType = NTLMv2; 525 server->secType = NTLMv2;
@@ -538,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
538#endif */ 539#endif */
539 else { 540 else {
540 rc = -EOPNOTSUPP; 541 rc = -EOPNOTSUPP;
541 cERROR(1, ("Invalid security type")); 542 cERROR(1, "Invalid security type");
542 goto neg_err_exit; 543 goto neg_err_exit;
543 } 544 }
544 /* else ... any others ...? */ 545 /* else ... any others ...? */
@@ -550,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
550 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 551 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
551 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 552 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
552 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 553 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
553 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 554 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
554 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 555 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
555 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 556 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
556 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -581,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
581 if (memcmp(server->server_GUID, 582 if (memcmp(server->server_GUID,
582 pSMBr->u.extended_response. 583 pSMBr->u.extended_response.
583 GUID, 16) != 0) { 584 GUID, 16) != 0) {
584 cFYI(1, ("server UID changed")); 585 cFYI(1, "server UID changed");
585 memcpy(server->server_GUID, 586 memcpy(server->server_GUID,
586 pSMBr->u.extended_response.GUID, 587 pSMBr->u.extended_response.GUID,
587 16); 588 16);
@@ -596,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
596 server->secType = RawNTLMSSP; 597 server->secType = RawNTLMSSP;
597 } else { 598 } else {
598 rc = decode_negTokenInit(pSMBr->u.extended_response. 599 rc = decode_negTokenInit(pSMBr->u.extended_response.
599 SecurityBlob, 600 SecurityBlob, count - 16,
600 count - 16, 601 server);
601 &server->secType);
602 if (rc == 1) 602 if (rc == 1)
603 rc = 0; 603 rc = 0;
604 else 604 else
605 rc = -EINVAL; 605 rc = -EINVAL;
606
607 if (server->sec_kerberos || server->sec_mskerberos)
608 server->secType = Kerberos;
609 else if (server->sec_ntlmssp)
610 server->secType = RawNTLMSSP;
611 else
612 rc = -EOPNOTSUPP;
606 } 613 }
607 } else 614 } else
608 server->capabilities &= ~CAP_EXTENDED_SECURITY; 615 server->capabilities &= ~CAP_EXTENDED_SECURITY;
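
Since decode_negTokenInit() now records each advertised mechanism in per-server booleans instead of returning a single enum (which is why the MSKerberos value disappears from the securityEnum in cifsglob.h), the caller chooses the flavor itself, preferring Kerberos over raw NTLMSSP. Factored out of the hunk above, the selection logic reads roughly as follows (the helper name is ours, not from the patch; it presumes the cifsglob.h definitions):

/* Hypothetical helper mirroring the selection in CIFSSMBNegotiate(). */
static int pick_sectype(struct TCP_Server_Info *server)
{
	if (server->sec_kerberos || server->sec_mskerberos)
		server->secType = Kerberos;	/* strongest offered mech */
	else if (server->sec_ntlmssp)
		server->secType = RawNTLMSSP;
	else
		return -EOPNOTSUPP;	/* no mutually supported mechanism */
	return 0;
}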
@@ -613,22 +620,21 @@ signing_check:
613 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { 620 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
614 /* MUST_SIGN already includes the MAY_SIGN FLAG 621 /* MUST_SIGN already includes the MAY_SIGN FLAG
615 so if this is zero it means that signing is disabled */ 622 so if this is zero it means that signing is disabled */
616 cFYI(1, ("Signing disabled")); 623 cFYI(1, "Signing disabled");
617 if (server->secMode & SECMODE_SIGN_REQUIRED) { 624 if (server->secMode & SECMODE_SIGN_REQUIRED) {
618 cERROR(1, ("Server requires " 625 cERROR(1, "Server requires "
619 "packet signing to be enabled in " 626 "packet signing to be enabled in "
620 "/proc/fs/cifs/SecurityFlags.")); 627 "/proc/fs/cifs/SecurityFlags.");
621 rc = -EOPNOTSUPP; 628 rc = -EOPNOTSUPP;
622 } 629 }
623 server->secMode &= 630 server->secMode &=
624 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 631 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
625 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 632 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
626 /* signing required */ 633 /* signing required */
627 cFYI(1, ("Must sign - secFlags 0x%x", secFlags)); 634 cFYI(1, "Must sign - secFlags 0x%x", secFlags);
628 if ((server->secMode & 635 if ((server->secMode &
629 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { 636 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
630 cERROR(1, 637 cERROR(1, "signing required but server lacks support");
631 ("signing required but server lacks support"));
632 rc = -EOPNOTSUPP; 638 rc = -EOPNOTSUPP;
633 } else 639 } else
634 server->secMode |= SECMODE_SIGN_REQUIRED; 640 server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -642,7 +648,7 @@ signing_check:
642neg_err_exit: 648neg_err_exit:
643 cifs_buf_release(pSMB); 649 cifs_buf_release(pSMB);
644 650
645 cFYI(1, ("negprot rc %d", rc)); 651 cFYI(1, "negprot rc %d", rc);
646 return rc; 652 return rc;
647} 653}
648 654
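
The signing logic above leans on a flag convention: each CIFSSEC_MUST_* constant is defined with the matching CIFSSEC_MAY_* bit included, so testing (secFlags & CIFSSEC_MAY_SIGN) == 0 proves signing is fully disabled (MUST cannot be set either), while (secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN detects the required case. A toy model of that flag lattice (bit values invented for the example):

#include <assert.h>

#define MAY_SIGN	0x00001			/* illustrative values only */
#define MUST_SIGN	(0x10000 | MAY_SIGN)	/* MUST implies MAY */

static int signing_disabled(unsigned int flags)
{
	return (flags & MAY_SIGN) == 0;		/* rules out MUST as well */
}

static int signing_required(unsigned int flags)
{
	return (flags & MUST_SIGN) == MUST_SIGN;
}

int main(void)
{
	assert(signing_disabled(0));
	assert(!signing_disabled(MUST_SIGN));	/* MUST includes MAY */
	assert(signing_required(MUST_SIGN));
	assert(!signing_required(MAY_SIGN));	/* MAY alone is not MUST */
	return 0;
}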
@@ -652,7 +658,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
652 struct smb_hdr *smb_buffer; 658 struct smb_hdr *smb_buffer;
653 int rc = 0; 659 int rc = 0;
654 660
655 cFYI(1, ("In tree disconnect")); 661 cFYI(1, "In tree disconnect");
656 662
657 /* BB: do we need to check this? These should never be NULL. */ 663 /* BB: do we need to check this? These should never be NULL. */
658 if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) 664 if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -674,7 +680,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
674 680
675 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); 681 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
676 if (rc) 682 if (rc)
677 cFYI(1, ("Tree disconnect failed %d", rc)); 683 cFYI(1, "Tree disconnect failed %d", rc);
678 684
679 /* No need to return error on this operation if tid invalidated and 685 /* No need to return error on this operation if tid invalidated and
680 closed on server already e.g. due to tcp session crashing */ 686 closed on server already e.g. due to tcp session crashing */
@@ -690,7 +696,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
690 LOGOFF_ANDX_REQ *pSMB; 696 LOGOFF_ANDX_REQ *pSMB;
691 int rc = 0; 697 int rc = 0;
692 698
693 cFYI(1, ("In SMBLogoff for session disconnect")); 699 cFYI(1, "In SMBLogoff for session disconnect");
694 700
695 /* 701 /*
696 * BB: do we need to check validity of ses and server? They should 702 * BB: do we need to check validity of ses and server? They should
@@ -743,7 +749,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
743 int bytes_returned = 0; 749 int bytes_returned = 0;
744 __u16 params, param_offset, offset, byte_count; 750 __u16 params, param_offset, offset, byte_count;
745 751
746 cFYI(1, ("In POSIX delete")); 752 cFYI(1, "In POSIX delete");
747PsxDelete: 753PsxDelete:
748 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 754 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
749 (void **) &pSMBr); 755 (void **) &pSMBr);
@@ -795,7 +801,7 @@ PsxDelete:
795 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 801 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
796 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 802 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
797 if (rc) 803 if (rc)
798 cFYI(1, ("Posix delete returned %d", rc)); 804 cFYI(1, "Posix delete returned %d", rc);
799 cifs_buf_release(pSMB); 805 cifs_buf_release(pSMB);
800 806
801 cifs_stats_inc(&tcon->num_deletes); 807 cifs_stats_inc(&tcon->num_deletes);
@@ -842,7 +848,7 @@ DelFileRetry:
842 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 848 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
843 cifs_stats_inc(&tcon->num_deletes); 849 cifs_stats_inc(&tcon->num_deletes);
844 if (rc) 850 if (rc)
845 cFYI(1, ("Error in RMFile = %d", rc)); 851 cFYI(1, "Error in RMFile = %d", rc);
846 852
847 cifs_buf_release(pSMB); 853 cifs_buf_release(pSMB);
848 if (rc == -EAGAIN) 854 if (rc == -EAGAIN)
@@ -861,7 +867,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
861 int bytes_returned; 867 int bytes_returned;
862 int name_len; 868 int name_len;
863 869
864 cFYI(1, ("In CIFSSMBRmDir")); 870 cFYI(1, "In CIFSSMBRmDir");
865RmDirRetry: 871RmDirRetry:
866 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, 872 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
867 (void **) &pSMBr); 873 (void **) &pSMBr);
@@ -886,7 +892,7 @@ RmDirRetry:
886 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 892 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
887 cifs_stats_inc(&tcon->num_rmdirs); 893 cifs_stats_inc(&tcon->num_rmdirs);
888 if (rc) 894 if (rc)
889 cFYI(1, ("Error in RMDir = %d", rc)); 895 cFYI(1, "Error in RMDir = %d", rc);
890 896
891 cifs_buf_release(pSMB); 897 cifs_buf_release(pSMB);
892 if (rc == -EAGAIN) 898 if (rc == -EAGAIN)
@@ -904,7 +910,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
904 int bytes_returned; 910 int bytes_returned;
905 int name_len; 911 int name_len;
906 912
907 cFYI(1, ("In CIFSSMBMkDir")); 913 cFYI(1, "In CIFSSMBMkDir");
908MkDirRetry: 914MkDirRetry:
909 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, 915 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
910 (void **) &pSMBr); 916 (void **) &pSMBr);
@@ -929,7 +935,7 @@ MkDirRetry:
929 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 935 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
930 cifs_stats_inc(&tcon->num_mkdirs); 936 cifs_stats_inc(&tcon->num_mkdirs);
931 if (rc) 937 if (rc)
932 cFYI(1, ("Error in Mkdir = %d", rc)); 938 cFYI(1, "Error in Mkdir = %d", rc);
933 939
934 cifs_buf_release(pSMB); 940 cifs_buf_release(pSMB);
935 if (rc == -EAGAIN) 941 if (rc == -EAGAIN)
@@ -952,7 +958,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
952 OPEN_PSX_REQ *pdata; 958 OPEN_PSX_REQ *pdata;
953 OPEN_PSX_RSP *psx_rsp; 959 OPEN_PSX_RSP *psx_rsp;
954 960
955 cFYI(1, ("In POSIX Create")); 961 cFYI(1, "In POSIX Create");
956PsxCreat: 962PsxCreat:
957 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 963 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
958 (void **) &pSMBr); 964 (void **) &pSMBr);
@@ -1006,11 +1012,11 @@ PsxCreat:
1006 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1012 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1007 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1013 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1008 if (rc) { 1014 if (rc) {
1009 cFYI(1, ("Posix create returned %d", rc)); 1015 cFYI(1, "Posix create returned %d", rc);
1010 goto psx_create_err; 1016 goto psx_create_err;
1011 } 1017 }
1012 1018
1013 cFYI(1, ("copying inode info")); 1019 cFYI(1, "copying inode info");
1014 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 1020 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
1015 1021
1016 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) { 1022 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1032,11 +1038,11 @@ PsxCreat:
1032 /* check to make sure response data is there */ 1038 /* check to make sure response data is there */
1033 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { 1039 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
1034 pRetData->Type = cpu_to_le32(-1); /* unknown */ 1040 pRetData->Type = cpu_to_le32(-1); /* unknown */
1035 cFYI(DBG2, ("unknown type")); 1041 cFYI(DBG2, "unknown type");
1036 } else { 1042 } else {
1037 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 1043 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1038 + sizeof(FILE_UNIX_BASIC_INFO)) { 1044 + sizeof(FILE_UNIX_BASIC_INFO)) {
1039 cERROR(1, ("Open response data too small")); 1045 cERROR(1, "Open response data too small");
1040 pRetData->Type = cpu_to_le32(-1); 1046 pRetData->Type = cpu_to_le32(-1);
1041 goto psx_create_err; 1047 goto psx_create_err;
1042 } 1048 }
@@ -1083,7 +1089,7 @@ static __u16 convert_disposition(int disposition)
1083 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; 1089 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
1084 break; 1090 break;
1085 default: 1091 default:
1086 cFYI(1, ("unknown disposition %d", disposition)); 1092 cFYI(1, "unknown disposition %d", disposition);
1087 ofun = SMBOPEN_OAPPEND; /* regular open */ 1093 ofun = SMBOPEN_OAPPEND; /* regular open */
1088 } 1094 }
1089 return ofun; 1095 return ofun;
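The hunk above touches convert_disposition(), which maps an NT-style create disposition onto the legacy SMBOPEN_* "open function" bits and logs any value it does not recognize before falling back to a plain open. A minimal standalone sketch of that pattern; the bit values and the case label are illustrative stand-ins, not the real cifspdu.h definitions:

#include <stdio.h>

/* Illustrative stand-ins for the SMBOPEN_* bits defined in cifspdu.h. */
#define SMBOPEN_OAPPEND 0x0001
#define SMBOPEN_OTRUNC  0x0002
#define SMBOPEN_OCREATE 0x0010

static unsigned short convert_disp(int disposition)
{
	switch (disposition) {
	case 5:			/* assumed: an overwrite-if disposition */
		return SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
	default:
		fprintf(stderr, "unknown disposition %d\n", disposition);
		return SMBOPEN_OAPPEND;	/* treated as a regular open */
	}
}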
@@ -1174,7 +1180,7 @@ OldOpenRetry:
1174 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1180 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1175 cifs_stats_inc(&tcon->num_opens); 1181 cifs_stats_inc(&tcon->num_opens);
1176 if (rc) { 1182 if (rc) {
1177 cFYI(1, ("Error in Open = %d", rc)); 1183 cFYI(1, "Error in Open = %d", rc);
1178 } else { 1184 } else {
1179 /* BB verify if wct == 15 */ 1185 /* BB verify if wct == 15 */
1180 1186
@@ -1287,7 +1293,7 @@ openRetry:
1287 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1293 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1288 cifs_stats_inc(&tcon->num_opens); 1294 cifs_stats_inc(&tcon->num_opens);
1289 if (rc) { 1295 if (rc) {
1290 cFYI(1, ("Error in Open = %d", rc)); 1296 cFYI(1, "Error in Open = %d", rc);
1291 } else { 1297 } else {
1292 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ 1298 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
1293 *netfid = pSMBr->Fid; /* cifs fid stays in le */ 1299 *netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1325,7 +1331,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1325 int resp_buf_type = 0; 1331 int resp_buf_type = 0;
1326 struct kvec iov[1]; 1332 struct kvec iov[1];
1327 1333
1328 cFYI(1, ("Reading %d bytes on fid %d", count, netfid)); 1334 cFYI(1, "Reading %d bytes on fid %d", count, netfid);
1329 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1335 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1330 wct = 12; 1336 wct = 12;
1331 else { 1337 else {
@@ -1370,7 +1376,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1370 cifs_stats_inc(&tcon->num_reads); 1376 cifs_stats_inc(&tcon->num_reads);
1371 pSMBr = (READ_RSP *)iov[0].iov_base; 1377 pSMBr = (READ_RSP *)iov[0].iov_base;
1372 if (rc) { 1378 if (rc) {
1373 cERROR(1, ("Send error in read = %d", rc)); 1379 cERROR(1, "Send error in read = %d", rc);
1374 } else { 1380 } else {
1375 int data_length = le16_to_cpu(pSMBr->DataLengthHigh); 1381 int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
1376 data_length = data_length << 16; 1382 data_length = data_length << 16;
@@ -1380,15 +1386,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1380 /*check that DataLength would not go beyond end of SMB */ 1386 /*check that DataLength would not go beyond end of SMB */
1381 if ((data_length > CIFSMaxBufSize) 1387 if ((data_length > CIFSMaxBufSize)
1382 || (data_length > count)) { 1388 || (data_length > count)) {
1383 cFYI(1, ("bad length %d for count %d", 1389 cFYI(1, "bad length %d for count %d",
1384 data_length, count)); 1390 data_length, count);
1385 rc = -EIO; 1391 rc = -EIO;
1386 *nbytes = 0; 1392 *nbytes = 0;
1387 } else { 1393 } else {
1388 pReadData = (char *) (&pSMBr->hdr.Protocol) + 1394 pReadData = (char *) (&pSMBr->hdr.Protocol) +
1389 le16_to_cpu(pSMBr->DataOffset); 1395 le16_to_cpu(pSMBr->DataOffset);
1390 /* if (rc = copy_to_user(buf, pReadData, data_length)) { 1396 /* if (rc = copy_to_user(buf, pReadData, data_length)) {
1391 cERROR(1,("Faulting on read rc = %d",rc)); 1397 cERROR(1, "Faulting on read rc = %d",rc);
1392 rc = -EFAULT; 1398 rc = -EFAULT;
1393 }*/ /* can not use copy_to_user when using page cache*/ 1399 }*/ /* can not use copy_to_user when using page cache*/
1394 if (*buf) 1400 if (*buf)
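In the read-response decode above, DataLengthHigh and DataLength are two 16-bit halves of one 32-bit payload length, which is then bounded by both the negotiated buffer size and the byte count the caller requested before DataOffset is used to locate the payload. A sketch of that reconstruction, assuming le16_to_cpu conversion has already been applied (max_buf stands in for CIFSMaxBufSize):

#include <errno.h>
#include <stdint.h>

/* Rebuild the split length and refuse values the server cannot
 * legitimately have produced. */
static int smb_read_len(uint16_t len_high, uint16_t len_low,
			uint32_t count, uint32_t max_buf, uint32_t *out)
{
	uint32_t data_length = ((uint32_t)len_high << 16) | len_low;

	if (data_length > max_buf || data_length > count)
		return -EIO;	/* claimed payload exceeds buffer/request */
	*out = data_length;
	return 0;
}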
@@ -1430,7 +1436,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1430 __u32 bytes_sent; 1436 __u32 bytes_sent;
1431 __u16 byte_count; 1437 __u16 byte_count;
1432 1438
1433 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ 1439 *nbytes = 0;
1440
1441 /* cFYI(1, "write at %lld %d bytes", offset, count);*/
1434 if (tcon->ses == NULL) 1442 if (tcon->ses == NULL)
1435 return -ECONNABORTED; 1443 return -ECONNABORTED;
1436 1444
@@ -1511,12 +1519,19 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1511 (struct smb_hdr *) pSMBr, &bytes_returned, long_op); 1519 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
1512 cifs_stats_inc(&tcon->num_writes); 1520 cifs_stats_inc(&tcon->num_writes);
1513 if (rc) { 1521 if (rc) {
1514 cFYI(1, ("Send error in write = %d", rc)); 1522 cFYI(1, "Send error in write = %d", rc);
1515 *nbytes = 0;
1516 } else { 1523 } else {
1517 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1524 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1518 *nbytes = (*nbytes) << 16; 1525 *nbytes = (*nbytes) << 16;
1519 *nbytes += le16_to_cpu(pSMBr->Count); 1526 *nbytes += le16_to_cpu(pSMBr->Count);
1527
1528 /*
1529 * Mask off high 16 bits when bytes written as returned by the
1530 * server is greater than bytes requested by the client. Some
1531 * OS/2 servers are known to set incorrect CountHigh values.
1532 */
1533 if (*nbytes > count)
1534 *nbytes &= 0xFFFF;
1520 } 1535 }
1521 1536
1522 cifs_buf_release(pSMB); 1537 cifs_buf_release(pSMB);
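The comment and mask introduced above (and repeated in the CIFSSMBWrite2 hunk below) handle servers, notably OS/2, that return a bogus CountHigh. Because a write can never complete more bytes than were requested, an implausible combined count is reduced to its low 16 bits. A host-order sketch of the same guard:

#include <stdint.h>

static uint32_t smb_write_count(uint16_t count_high, uint16_t count_low,
				uint32_t requested)
{
	uint32_t nbytes = ((uint32_t)count_high << 16) | count_low;

	/* Bogus high word from the server: keep only the low half. */
	if (nbytes > requested)
		nbytes &= 0xFFFF;
	return nbytes;
}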
@@ -1541,7 +1556,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1541 1556
1542 *nbytes = 0; 1557 *nbytes = 0;
1543 1558
1544 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1559 cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
1545 1560
1546 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1561 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
1547 wct = 14; 1562 wct = 14;
@@ -1596,7 +1611,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1596 long_op); 1611 long_op);
1597 cifs_stats_inc(&tcon->num_writes); 1612 cifs_stats_inc(&tcon->num_writes);
1598 if (rc) { 1613 if (rc) {
1599 cFYI(1, ("Send error Write2 = %d", rc)); 1614 cFYI(1, "Send error Write2 = %d", rc);
1600 } else if (resp_buf_type == 0) { 1615 } else if (resp_buf_type == 0) {
1601 /* presumably this can not happen, but best to be safe */ 1616 /* presumably this can not happen, but best to be safe */
1602 rc = -EIO; 1617 rc = -EIO;
@@ -1605,6 +1620,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1605 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1620 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1606 *nbytes = (*nbytes) << 16; 1621 *nbytes = (*nbytes) << 16;
1607 *nbytes += le16_to_cpu(pSMBr->Count); 1622 *nbytes += le16_to_cpu(pSMBr->Count);
1623
1624 /*
1625 * Mask off high 16 bits when bytes written as returned by the
1626 * server is greater than bytes requested by the client. OS/2
1627 * servers are known to set incorrect CountHigh values.
1628 */
1629 if (*nbytes > count)
1630 *nbytes &= 0xFFFF;
1608 } 1631 }
1609 1632
1610 /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1633 /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1633,7 +1656,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1633 int timeout = 0; 1656 int timeout = 0;
1634 __u16 count; 1657 __u16 count;
1635 1658
1636 cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock)); 1659 cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
1637 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); 1660 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
1638 1661
1639 if (rc) 1662 if (rc)
@@ -1681,7 +1704,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1681 } 1704 }
1682 cifs_stats_inc(&tcon->num_locks); 1705 cifs_stats_inc(&tcon->num_locks);
1683 if (rc) 1706 if (rc)
1684 cFYI(1, ("Send error in Lock = %d", rc)); 1707 cFYI(1, "Send error in Lock = %d", rc);
1685 1708
1686 /* Note: On -EAGAIN error only caller can retry on handle based calls 1709 /* Note: On -EAGAIN error only caller can retry on handle based calls
1687 since file handle passed in no longer valid */ 1710 since file handle passed in no longer valid */
@@ -1704,7 +1727,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1704 __u16 params, param_offset, offset, byte_count, count; 1727 __u16 params, param_offset, offset, byte_count, count;
1705 struct kvec iov[1]; 1728 struct kvec iov[1];
1706 1729
1707 cFYI(1, ("Posix Lock")); 1730 cFYI(1, "Posix Lock");
1708 1731
1709 if (pLockData == NULL) 1732 if (pLockData == NULL)
1710 return -EINVAL; 1733 return -EINVAL;
@@ -1774,7 +1797,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1774 } 1797 }
1775 1798
1776 if (rc) { 1799 if (rc) {
1777 cFYI(1, ("Send error in Posix Lock = %d", rc)); 1800 cFYI(1, "Send error in Posix Lock = %d", rc);
1778 } else if (get_flag) { 1801 } else if (get_flag) {
1779 /* lock structure can be returned on get */ 1802 /* lock structure can be returned on get */
1780 __u16 data_offset; 1803 __u16 data_offset;
@@ -1793,8 +1816,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1793 } 1816 }
1794 parm_data = (struct cifs_posix_lock *) 1817 parm_data = (struct cifs_posix_lock *)
1795 ((char *)&pSMBr->hdr.Protocol + data_offset); 1818 ((char *)&pSMBr->hdr.Protocol + data_offset);
1796 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) 1819 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
1797 pLockData->fl_type = F_UNLCK; 1820 pLockData->fl_type = F_UNLCK;
1821 else {
1822 if (parm_data->lock_type ==
1823 __constant_cpu_to_le16(CIFS_RDLCK))
1824 pLockData->fl_type = F_RDLCK;
1825 else if (parm_data->lock_type ==
1826 __constant_cpu_to_le16(CIFS_WRLCK))
1827 pLockData->fl_type = F_WRLCK;
1828
1829 pLockData->fl_start = parm_data->start;
1830 pLockData->fl_end = parm_data->start +
1831 parm_data->length - 1;
1832 pLockData->fl_pid = parm_data->pid;
1833 }
1798 } 1834 }
1799 1835
1800 plk_err_exit: 1836 plk_err_exit:
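The else-branch added above fills in the caller's lock from a granted conflicting lock in the reply: the little-endian lock_type selects F_RDLCK or F_WRLCK, and the wire (start, length) pair becomes the inclusive [fl_start, fl_end] range. A simplified host-order sketch with hypothetical struct layouts (the driver compares __constant_cpu_to_le16 values in place and uses the CIFS_RDLCK/CIFS_WRLCK/CIFS_UNLCK constants from cifspdu.h):

#include <fcntl.h>	/* F_RDLCK, F_WRLCK, F_UNLCK */
#include <stdint.h>

enum { RD = 0, WR = 1, UN = 2 };	/* assumed wire values */

struct wire_lock { uint16_t lock_type; uint32_t pid; uint64_t start, length; };
struct host_lock { short type; uint64_t start, end; uint32_t pid; };

static void decode_posix_lock(const struct wire_lock *w, struct host_lock *h)
{
	if (w->lock_type == UN) {
		h->type = F_UNLCK;
		return;
	}
	h->type  = (w->lock_type == RD) ? F_RDLCK : F_WRLCK;
	h->start = w->start;
	h->end   = w->start + w->length - 1;	/* inclusive last byte */
	h->pid   = w->pid;
}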
@@ -1818,7 +1854,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1818{ 1854{
1819 int rc = 0; 1855 int rc = 0;
1820 CLOSE_REQ *pSMB = NULL; 1856 CLOSE_REQ *pSMB = NULL;
1821 cFYI(1, ("In CIFSSMBClose")); 1857 cFYI(1, "In CIFSSMBClose");
1822 1858
1823 /* do not retry on dead session on close */ 1859 /* do not retry on dead session on close */
1824 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); 1860 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1835,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1835 if (rc) { 1871 if (rc) {
1836 if (rc != -EINTR) { 1872 if (rc != -EINTR) {
1837 /* EINTR is expected when user ctl-c to kill app */ 1873 /* EINTR is expected when user ctl-c to kill app */
1838 cERROR(1, ("Send error in Close = %d", rc)); 1874 cERROR(1, "Send error in Close = %d", rc);
1839 } 1875 }
1840 } 1876 }
1841 1877
@@ -1851,7 +1887,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1851{ 1887{
1852 int rc = 0; 1888 int rc = 0;
1853 FLUSH_REQ *pSMB = NULL; 1889 FLUSH_REQ *pSMB = NULL;
1854 cFYI(1, ("In CIFSSMBFlush")); 1890 cFYI(1, "In CIFSSMBFlush");
1855 1891
1856 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); 1892 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
1857 if (rc) 1893 if (rc)
@@ -1862,7 +1898,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1862 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 1898 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
1863 cifs_stats_inc(&tcon->num_flushes); 1899 cifs_stats_inc(&tcon->num_flushes);
1864 if (rc) 1900 if (rc)
1865 cERROR(1, ("Send error in Flush = %d", rc)); 1901 cERROR(1, "Send error in Flush = %d", rc);
1866 1902
1867 return rc; 1903 return rc;
1868} 1904}
@@ -1879,7 +1915,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
1879 int name_len, name_len2; 1915 int name_len, name_len2;
1880 __u16 count; 1916 __u16 count;
1881 1917
1882 cFYI(1, ("In CIFSSMBRename")); 1918 cFYI(1, "In CIFSSMBRename");
1883 renameRetry: 1919 renameRetry:
1884 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, 1920 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
1885 (void **) &pSMBr); 1921 (void **) &pSMBr);
@@ -1925,7 +1961,7 @@ renameRetry:
1925 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1961 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1926 cifs_stats_inc(&tcon->num_renames); 1962 cifs_stats_inc(&tcon->num_renames);
1927 if (rc) 1963 if (rc)
1928 cFYI(1, ("Send error in rename = %d", rc)); 1964 cFYI(1, "Send error in rename = %d", rc);
1929 1965
1930 cifs_buf_release(pSMB); 1966 cifs_buf_release(pSMB);
1931 1967
@@ -1949,7 +1985,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
1949 int len_of_str; 1985 int len_of_str;
1950 __u16 params, param_offset, offset, count, byte_count; 1986 __u16 params, param_offset, offset, count, byte_count;
1951 1987
1952 cFYI(1, ("Rename to File by handle")); 1988 cFYI(1, "Rename to File by handle");
1953 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, 1989 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
1954 (void **) &pSMBr); 1990 (void **) &pSMBr);
1955 if (rc) 1991 if (rc)
@@ -2004,7 +2040,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2004 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2040 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2005 cifs_stats_inc(&pTcon->num_t2renames); 2041 cifs_stats_inc(&pTcon->num_t2renames);
2006 if (rc) 2042 if (rc)
2007 cFYI(1, ("Send error in Rename (by file handle) = %d", rc)); 2043 cFYI(1, "Send error in Rename (by file handle) = %d", rc);
2008 2044
2009 cifs_buf_release(pSMB); 2045 cifs_buf_release(pSMB);
2010 2046
@@ -2026,7 +2062,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
2026 int name_len, name_len2; 2062 int name_len, name_len2;
2027 __u16 count; 2063 __u16 count;
2028 2064
2029 cFYI(1, ("In CIFSSMBCopy")); 2065 cFYI(1, "In CIFSSMBCopy");
2030 copyRetry: 2066 copyRetry:
2031 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, 2067 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
2032 (void **) &pSMBr); 2068 (void **) &pSMBr);
@@ -2071,8 +2107,8 @@ copyRetry:
2071 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2107 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2072 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2108 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2073 if (rc) { 2109 if (rc) {
2074 cFYI(1, ("Send error in copy = %d with %d files copied", 2110 cFYI(1, "Send error in copy = %d with %d files copied",
2075 rc, le16_to_cpu(pSMBr->CopyCount))); 2111 rc, le16_to_cpu(pSMBr->CopyCount));
2076 } 2112 }
2077 cifs_buf_release(pSMB); 2113 cifs_buf_release(pSMB);
2078 2114
@@ -2096,7 +2132,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
2096 int bytes_returned = 0; 2132 int bytes_returned = 0;
2097 __u16 params, param_offset, offset, byte_count; 2133 __u16 params, param_offset, offset, byte_count;
2098 2134
2099 cFYI(1, ("In Symlink Unix style")); 2135 cFYI(1, "In Symlink Unix style");
2100 createSymLinkRetry: 2136 createSymLinkRetry:
2101 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2137 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2102 (void **) &pSMBr); 2138 (void **) &pSMBr);
@@ -2161,7 +2197,7 @@ createSymLinkRetry:
2161 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2197 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2162 cifs_stats_inc(&tcon->num_symlinks); 2198 cifs_stats_inc(&tcon->num_symlinks);
2163 if (rc) 2199 if (rc)
2164 cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc)); 2200 cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
2165 2201
2166 cifs_buf_release(pSMB); 2202 cifs_buf_release(pSMB);
2167 2203
@@ -2185,7 +2221,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2185 int bytes_returned = 0; 2221 int bytes_returned = 0;
2186 __u16 params, param_offset, offset, byte_count; 2222 __u16 params, param_offset, offset, byte_count;
2187 2223
2188 cFYI(1, ("In Create Hard link Unix style")); 2224 cFYI(1, "In Create Hard link Unix style");
2189 createHardLinkRetry: 2225 createHardLinkRetry:
2190 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2226 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2191 (void **) &pSMBr); 2227 (void **) &pSMBr);
@@ -2247,7 +2283,7 @@ createHardLinkRetry:
2247 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2283 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2248 cifs_stats_inc(&tcon->num_hardlinks); 2284 cifs_stats_inc(&tcon->num_hardlinks);
2249 if (rc) 2285 if (rc)
2250 cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc)); 2286 cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
2251 2287
2252 cifs_buf_release(pSMB); 2288 cifs_buf_release(pSMB);
2253 if (rc == -EAGAIN) 2289 if (rc == -EAGAIN)
@@ -2268,7 +2304,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2268 int name_len, name_len2; 2304 int name_len, name_len2;
2269 __u16 count; 2305 __u16 count;
2270 2306
2271 cFYI(1, ("In CIFSCreateHardLink")); 2307 cFYI(1, "In CIFSCreateHardLink");
2272 winCreateHardLinkRetry: 2308 winCreateHardLinkRetry:
2273 2309
2274 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, 2310 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2319,7 +2355,7 @@ winCreateHardLinkRetry:
2319 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2320 cifs_stats_inc(&tcon->num_hardlinks); 2356 cifs_stats_inc(&tcon->num_hardlinks);
2321 if (rc) 2357 if (rc)
2322 cFYI(1, ("Send error in hard link (NT rename) = %d", rc)); 2358 cFYI(1, "Send error in hard link (NT rename) = %d", rc);
2323 2359
2324 cifs_buf_release(pSMB); 2360 cifs_buf_release(pSMB);
2325 if (rc == -EAGAIN) 2361 if (rc == -EAGAIN)
@@ -2342,7 +2378,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2342 __u16 params, byte_count; 2378 __u16 params, byte_count;
2343 char *data_start; 2379 char *data_start;
2344 2380
2345 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2381 cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
2346 2382
2347 querySymLinkRetry: 2383 querySymLinkRetry:
2348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2384 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2389,7 +2425,7 @@ querySymLinkRetry:
2389 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2425 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2390 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2426 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2391 if (rc) { 2427 if (rc) {
2392 cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc)); 2428 cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
2393 } else { 2429 } else {
2394 /* decode response */ 2430 /* decode response */
2395 2431
@@ -2490,21 +2526,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2490 2526
2491 /* should we also check that parm and data areas do not overlap? */ 2527 /* should we also check that parm and data areas do not overlap? */
2492 if (*ppparm > end_of_smb) { 2528 if (*ppparm > end_of_smb) {
2493 cFYI(1, ("parms start after end of smb")); 2529 cFYI(1, "parms start after end of smb");
2494 return -EINVAL; 2530 return -EINVAL;
2495 } else if (parm_count + *ppparm > end_of_smb) { 2531 } else if (parm_count + *ppparm > end_of_smb) {
2496 cFYI(1, ("parm end after end of smb")); 2532 cFYI(1, "parm end after end of smb");
2497 return -EINVAL; 2533 return -EINVAL;
2498 } else if (*ppdata > end_of_smb) { 2534 } else if (*ppdata > end_of_smb) {
2499 cFYI(1, ("data starts after end of smb")); 2535 cFYI(1, "data starts after end of smb");
2500 return -EINVAL; 2536 return -EINVAL;
2501 } else if (data_count + *ppdata > end_of_smb) { 2537 } else if (data_count + *ppdata > end_of_smb) {
2502 cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p", 2538 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2503 *ppdata, data_count, (data_count + *ppdata), 2539 *ppdata, data_count, (data_count + *ppdata),
2504 end_of_smb, pSMBr)); 2540 end_of_smb, pSMBr);
2505 return -EINVAL; 2541 return -EINVAL;
2506 } else if (parm_count + data_count > pSMBr->ByteCount) { 2542 } else if (parm_count + data_count > pSMBr->ByteCount) {
2507 cFYI(1, ("parm count and data count larger than SMB")); 2543 cFYI(1, "parm count and data count larger than SMB");
2508 return -EINVAL; 2544 return -EINVAL;
2509 } 2545 }
2510 *pdatalen = data_count; 2546 *pdatalen = data_count;
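validate_ntransact above rejects any response whose advertised parameter or data region starts or ends past the received SMB, and whose combined counts exceed ByteCount. The same discipline expressed in overflow-safe index arithmetic rather than the pointer comparisons the driver uses:

#include <stddef.h>

/* Nonzero iff [offset, offset + count) fits in a buf_len-byte buffer;
 * phrased so that offset + count cannot wrap. */
static int region_in_buf(size_t buf_len, size_t offset, size_t count)
{
	return offset <= buf_len && count <= buf_len - offset;
}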
@@ -2523,7 +2559,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2523 struct smb_com_transaction_ioctl_req *pSMB; 2559 struct smb_com_transaction_ioctl_req *pSMB;
2524 struct smb_com_transaction_ioctl_rsp *pSMBr; 2560 struct smb_com_transaction_ioctl_rsp *pSMBr;
2525 2561
2526 cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName)); 2562 cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
2527 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 2563 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
2528 (void **) &pSMBr); 2564 (void **) &pSMBr);
2529 if (rc) 2565 if (rc)
@@ -2552,7 +2588,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2552 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2588 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2553 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2589 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2554 if (rc) { 2590 if (rc) {
2555 cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc)); 2591 cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
2556 } else { /* decode response */ 2592 } else { /* decode response */
2557 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2593 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2558 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2594 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2576,7 +2612,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2576 if ((reparse_buf->LinkNamesBuf + 2612 if ((reparse_buf->LinkNamesBuf +
2577 reparse_buf->TargetNameOffset + 2613 reparse_buf->TargetNameOffset +
2578 reparse_buf->TargetNameLen) > end_of_smb) { 2614 reparse_buf->TargetNameLen) > end_of_smb) {
2579 cFYI(1, ("reparse buf beyond SMB")); 2615 cFYI(1, "reparse buf beyond SMB");
2580 rc = -EIO; 2616 rc = -EIO;
2581 goto qreparse_out; 2617 goto qreparse_out;
2582 } 2618 }
@@ -2597,12 +2633,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2597 } 2633 }
2598 } else { 2634 } else {
2599 rc = -EIO; 2635 rc = -EIO;
2600 cFYI(1, ("Invalid return data count on " 2636 cFYI(1, "Invalid return data count on "
2601 "get reparse info ioctl")); 2637 "get reparse info ioctl");
2602 } 2638 }
2603 symlinkinfo[buflen] = 0; /* just in case so the caller 2639 symlinkinfo[buflen] = 0; /* just in case so the caller
2604 does not go off the end of the buffer */ 2640 does not go off the end of the buffer */
2605 cFYI(1, ("readlink result - %s", symlinkinfo)); 2641 cFYI(1, "readlink result - %s", symlinkinfo);
2606 } 2642 }
2607 2643
2608 qreparse_out: 2644 qreparse_out:
@@ -2625,7 +2661,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
2625 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); 2661 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
2626 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); 2662 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
2627 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); 2663 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
2628 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */ 2664 /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
2629 2665
2630 return; 2666 return;
2631} 2667}
@@ -2651,8 +2687,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
2651 size += sizeof(struct cifs_posix_ace) * count; 2687 size += sizeof(struct cifs_posix_ace) * count;
2652 /* check if we would go beyond end of SMB */ 2688 /* check if we would go beyond end of SMB */
2653 if (size_of_data_area < size) { 2689 if (size_of_data_area < size) {
2654 cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d", 2690 cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
2655 size_of_data_area, size)); 2691 size_of_data_area, size);
2656 return -EINVAL; 2692 return -EINVAL;
2657 } 2693 }
2658 } else if (acl_type & ACL_TYPE_DEFAULT) { 2694 } else if (acl_type & ACL_TYPE_DEFAULT) {
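cifs_copy_posix_acl above computes the expected blob size as a fixed header plus one fixed-width record per advertised ACE, and bails out when that would overrun the data area the server actually sent. A guard of the same shape with hypothetical types (a production check would also bound ace_count itself so the multiply cannot overflow):

#include <errno.h>
#include <stddef.h>

struct wire_ace { unsigned char perm, tag; unsigned long long uid; };

static int acl_size_ok(size_t data_area, size_t hdr_size, size_t ace_count)
{
	size_t need = hdr_size + ace_count * sizeof(struct wire_ace);

	return data_area >= need ? 0 : -EINVAL;	/* -EINVAL on short data */
}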
@@ -2699,7 +2735,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
2699 cifs_ace->cifs_uid = cpu_to_le64(-1); 2735 cifs_ace->cifs_uid = cpu_to_le64(-1);
2700 } else 2736 } else
2701 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); 2737 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
2702 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/ 2738 /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
2703 return rc; 2739 return rc;
2704} 2740}
2705 2741
@@ -2717,12 +2753,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2717 return 0; 2753 return 0;
2718 2754
2719 count = posix_acl_xattr_count((size_t)buflen); 2755 count = posix_acl_xattr_count((size_t)buflen);
2720 cFYI(1, ("setting acl with %d entries from buf of length %d and " 2756 cFYI(1, "setting acl with %d entries from buf of length %d and "
2721 "version of %d", 2757 "version of %d",
2722 count, buflen, le32_to_cpu(local_acl->a_version))); 2758 count, buflen, le32_to_cpu(local_acl->a_version));
2723 if (le32_to_cpu(local_acl->a_version) != 2) { 2759 if (le32_to_cpu(local_acl->a_version) != 2) {
2724 cFYI(1, ("unknown POSIX ACL version %d", 2760 cFYI(1, "unknown POSIX ACL version %d",
2725 le32_to_cpu(local_acl->a_version))); 2761 le32_to_cpu(local_acl->a_version));
2726 return 0; 2762 return 0;
2727 } 2763 }
2728 cifs_acl->version = cpu_to_le16(1); 2764 cifs_acl->version = cpu_to_le16(1);
@@ -2731,7 +2767,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2731 else if (acl_type == ACL_TYPE_DEFAULT) 2767 else if (acl_type == ACL_TYPE_DEFAULT)
2732 cifs_acl->default_entry_count = cpu_to_le16(count); 2768 cifs_acl->default_entry_count = cpu_to_le16(count);
2733 else { 2769 else {
2734 cFYI(1, ("unknown ACL type %d", acl_type)); 2770 cFYI(1, "unknown ACL type %d", acl_type);
2735 return 0; 2771 return 0;
2736 } 2772 }
2737 for (i = 0; i < count; i++) { 2773 for (i = 0; i < count; i++) {
@@ -2764,7 +2800,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2764 int name_len; 2800 int name_len;
2765 __u16 params, byte_count; 2801 __u16 params, byte_count;
2766 2802
2767 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2803 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2768 2804
2769 queryAclRetry: 2805 queryAclRetry:
2770 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2806 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2816,7 +2852,7 @@ queryAclRetry:
2816 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2852 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2817 cifs_stats_inc(&tcon->num_acl_get); 2853 cifs_stats_inc(&tcon->num_acl_get);
2818 if (rc) { 2854 if (rc) {
2819 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2855 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2820 } else { 2856 } else {
2821 /* decode response */ 2857 /* decode response */
2822 2858
@@ -2853,7 +2889,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2853 int bytes_returned = 0; 2889 int bytes_returned = 0;
2854 __u16 params, byte_count, data_count, param_offset, offset; 2890 __u16 params, byte_count, data_count, param_offset, offset;
2855 2891
2856 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2892 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2857 setAclRetry: 2893 setAclRetry:
2858 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2894 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2859 (void **) &pSMBr); 2895 (void **) &pSMBr);
@@ -2908,7 +2944,7 @@ setAclRetry:
2908 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2944 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2909 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2945 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2910 if (rc) 2946 if (rc)
2911 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2947 cFYI(1, "Set POSIX ACL returned %d", rc);
2912 2948
2913 setACLerrorExit: 2949 setACLerrorExit:
2914 cifs_buf_release(pSMB); 2950 cifs_buf_release(pSMB);
@@ -2928,7 +2964,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2928 int bytes_returned; 2964 int bytes_returned;
2929 __u16 params, byte_count; 2965 __u16 params, byte_count;
2930 2966
2931 cFYI(1, ("In GetExtAttr")); 2967 cFYI(1, "In GetExtAttr");
2932 if (tcon == NULL) 2968 if (tcon == NULL)
2933 return -ENODEV; 2969 return -ENODEV;
2934 2970
@@ -2967,7 +3003,7 @@ GetExtAttrRetry:
2967 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3003 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2968 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3004 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2969 if (rc) { 3005 if (rc) {
2970 cFYI(1, ("error %d in GetExtAttr", rc)); 3006 cFYI(1, "error %d in GetExtAttr", rc);
2971 } else { 3007 } else {
2972 /* decode response */ 3008 /* decode response */
2973 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3009 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -2982,7 +3018,7 @@ GetExtAttrRetry:
2982 struct file_chattr_info *pfinfo; 3018 struct file_chattr_info *pfinfo;
2983 /* BB Do we need a cast or hash here ? */ 3019 /* BB Do we need a cast or hash here ? */
2984 if (count != 16) { 3020 if (count != 16) {
2985 cFYI(1, ("Illegal size ret in GetExtAttr")); 3021 cFYI(1, "Illegal size ret in GetExtAttr");
2986 rc = -EIO; 3022 rc = -EIO;
2987 goto GetExtAttrOut; 3023 goto GetExtAttrOut;
2988 } 3024 }
@@ -3012,7 +3048,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3012 QUERY_SEC_DESC_REQ *pSMB; 3048 QUERY_SEC_DESC_REQ *pSMB;
3013 struct kvec iov[1]; 3049 struct kvec iov[1];
3014 3050
3015 cFYI(1, ("GetCifsACL")); 3051 cFYI(1, "GetCifsACL");
3016 3052
3017 *pbuflen = 0; 3053 *pbuflen = 0;
3018 *acl_inf = NULL; 3054 *acl_inf = NULL;
@@ -3037,7 +3073,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3037 CIFS_STD_OP); 3073 CIFS_STD_OP);
3038 cifs_stats_inc(&tcon->num_acl_get); 3074 cifs_stats_inc(&tcon->num_acl_get);
3039 if (rc) { 3075 if (rc) {
3040 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3076 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3041 } else { /* decode response */ 3077 } else { /* decode response */
3042 __le32 *parm; 3078 __le32 *parm;
3043 __u32 parm_len; 3079 __u32 parm_len;
@@ -3052,7 +3088,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3052 goto qsec_out; 3088 goto qsec_out;
3053 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3089 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3054 3090
3055 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3091 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3056 3092
3057 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3093 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3058 rc = -EIO; /* bad smb */ 3094 rc = -EIO; /* bad smb */
@@ -3064,8 +3100,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3064 3100
3065 acl_len = le32_to_cpu(*parm); 3101 acl_len = le32_to_cpu(*parm);
3066 if (acl_len != *pbuflen) { 3102 if (acl_len != *pbuflen) {
3067 cERROR(1, ("acl length %d does not match %d", 3103 cERROR(1, "acl length %d does not match %d",
3068 acl_len, *pbuflen)); 3104 acl_len, *pbuflen);
3069 if (*pbuflen > acl_len) 3105 if (*pbuflen > acl_len)
3070 *pbuflen = acl_len; 3106 *pbuflen = acl_len;
3071 } 3107 }
@@ -3074,7 +3110,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3074 header followed by the smallest SID */ 3110 header followed by the smallest SID */
3075 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3111 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3076 (*pbuflen >= 64 * 1024)) { 3112 (*pbuflen >= 64 * 1024)) {
3077 cERROR(1, ("bad acl length %d", *pbuflen)); 3113 cERROR(1, "bad acl length %d", *pbuflen);
3078 rc = -EINVAL; 3114 rc = -EINVAL;
3079 *pbuflen = 0; 3115 *pbuflen = 0;
3080 } else { 3116 } else {
@@ -3148,9 +3184,9 @@ setCifsAclRetry:
3148 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3184 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3149 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3185 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3150 3186
3151 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3187 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3152 if (rc) 3188 if (rc)
3153 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3189 cFYI(1, "Set CIFS ACL returned %d", rc);
3154 cifs_buf_release(pSMB); 3190 cifs_buf_release(pSMB);
3155 3191
3156 if (rc == -EAGAIN) 3192 if (rc == -EAGAIN)
@@ -3174,7 +3210,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3174 int bytes_returned; 3210 int bytes_returned;
3175 int name_len; 3211 int name_len;
3176 3212
3177 cFYI(1, ("In SMBQPath path %s", searchName)); 3213 cFYI(1, "In SMBQPath path %s", searchName);
3178 QInfRetry: 3214 QInfRetry:
3179 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3215 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3180 (void **) &pSMBr); 3216 (void **) &pSMBr);
@@ -3200,7 +3236,7 @@ QInfRetry:
3200 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3236 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3201 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3237 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3202 if (rc) { 3238 if (rc) {
3203 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3239 cFYI(1, "Send error in QueryInfo = %d", rc);
3204 } else if (pFinfo) { 3240 } else if (pFinfo) {
3205 struct timespec ts; 3241 struct timespec ts;
3206 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3242 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3274,7 +3310,7 @@ QFileInfoRetry:
3274 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3310 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3275 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3311 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3276 if (rc) { 3312 if (rc) {
3277 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3313 cFYI(1, "Send error in QPathInfo = %d", rc);
3278 } else { /* decode response */ 3314 } else { /* decode response */
3279 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3315 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3280 3316
@@ -3312,7 +3348,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3312 int name_len; 3348 int name_len;
3313 __u16 params, byte_count; 3349 __u16 params, byte_count;
3314 3350
3315 /* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3351 /* cFYI(1, "In QPathInfo path %s", searchName); */
3316 QPathInfoRetry: 3352 QPathInfoRetry:
3317 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3353 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3318 (void **) &pSMBr); 3354 (void **) &pSMBr);
@@ -3362,7 +3398,7 @@ QPathInfoRetry:
3362 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3398 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3363 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3399 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3364 if (rc) { 3400 if (rc) {
3365 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3401 cFYI(1, "Send error in QPathInfo = %d", rc);
3366 } else { /* decode response */ 3402 } else { /* decode response */
3367 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3403 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3368 3404
@@ -3442,14 +3478,14 @@ UnixQFileInfoRetry:
3442 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3443 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3479 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3444 if (rc) { 3480 if (rc) {
3445 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3481 cFYI(1, "Send error in QPathInfo = %d", rc);
3446 } else { /* decode response */ 3482 } else { /* decode response */
3447 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3483 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3448 3484
3449 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3485 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3450 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3486 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3451 "Unix Extensions can be disabled on mount " 3487 "Unix Extensions can be disabled on mount "
3452 "by specifying the nosfu mount option.")); 3488 "by specifying the nosfu mount option.");
3453 rc = -EIO; /* bad smb */ 3489 rc = -EIO; /* bad smb */
3454 } else { 3490 } else {
3455 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3491 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3481,7 +3517,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3481 int name_len; 3517 int name_len;
3482 __u16 params, byte_count; 3518 __u16 params, byte_count;
3483 3519
3484 cFYI(1, ("In QPathInfo (Unix) the path %s", searchName)); 3520 cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
3485 UnixQPathInfoRetry: 3521 UnixQPathInfoRetry:
3486 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3522 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3487 (void **) &pSMBr); 3523 (void **) &pSMBr);
@@ -3528,14 +3564,14 @@ UnixQPathInfoRetry:
3528 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3564 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3529 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3565 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3530 if (rc) { 3566 if (rc) {
3531 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3567 cFYI(1, "Send error in QPathInfo = %d", rc);
3532 } else { /* decode response */ 3568 } else { /* decode response */
3533 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3569 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3534 3570
3535 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3571 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3536 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3572 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3537 "Unix Extensions can be disabled on mount " 3573 "Unix Extensions can be disabled on mount "
3538 "by specifying the nosfu mount option.")); 3574 "by specifying the nosfu mount option.");
3539 rc = -EIO; /* bad smb */ 3575 rc = -EIO; /* bad smb */
3540 } else { 3576 } else {
3541 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3577 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3569,7 +3605,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
3569 int name_len; 3605 int name_len;
3570 __u16 params, byte_count; 3606 __u16 params, byte_count;
3571 3607
3572 cFYI(1, ("In FindFirst for %s", searchName)); 3608 cFYI(1, "In FindFirst for %s", searchName);
3573 3609
3574 findFirstRetry: 3610 findFirstRetry:
3575 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3611 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3646,7 +3682,7 @@ findFirstRetry:
3646 if (rc) {/* BB add logic to retry regular search if Unix search 3682 if (rc) {/* BB add logic to retry regular search if Unix search
3647 rejected unexpectedly by server */ 3683 rejected unexpectedly by server */
3648 /* BB Add code to handle unsupported level rc */ 3684 /* BB Add code to handle unsupported level rc */
3649 cFYI(1, ("Error in FindFirst = %d", rc)); 3685 cFYI(1, "Error in FindFirst = %d", rc);
3650 3686
3651 cifs_buf_release(pSMB); 3687 cifs_buf_release(pSMB);
3652 3688
@@ -3685,7 +3721,7 @@ findFirstRetry:
3685 lnoff = le16_to_cpu(parms->LastNameOffset); 3721 lnoff = le16_to_cpu(parms->LastNameOffset);
3686 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3722 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3687 lnoff) { 3723 lnoff) {
3688 cERROR(1, ("ignoring corrupt resume name")); 3724 cERROR(1, "ignoring corrupt resume name");
3689 psrch_inf->last_entry = NULL; 3725 psrch_inf->last_entry = NULL;
3690 return rc; 3726 return rc;
3691 } 3727 }
@@ -3713,7 +3749,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3713 int bytes_returned, name_len; 3749 int bytes_returned, name_len;
3714 __u16 params, byte_count; 3750 __u16 params, byte_count;
3715 3751
3716 cFYI(1, ("In FindNext")); 3752 cFYI(1, "In FindNext");
3717 3753
3718 if (psrch_inf->endOfSearch) 3754 if (psrch_inf->endOfSearch)
3719 return -ENOENT; 3755 return -ENOENT;
@@ -3777,7 +3813,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3777 cifs_buf_release(pSMB); 3813 cifs_buf_release(pSMB);
3778 rc = 0; /* search probably was closed at end of search*/ 3814 rc = 0; /* search probably was closed at end of search*/
3779 } else 3815 } else
3780 cFYI(1, ("FindNext returned = %d", rc)); 3816 cFYI(1, "FindNext returned = %d", rc);
3781 } else { /* decode response */ 3817 } else { /* decode response */
3782 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3818 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3783 3819
@@ -3813,15 +3849,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3813 lnoff = le16_to_cpu(parms->LastNameOffset); 3849 lnoff = le16_to_cpu(parms->LastNameOffset);
3814 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3850 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3815 lnoff) { 3851 lnoff) {
3816 cERROR(1, ("ignoring corrupt resume name")); 3852 cERROR(1, "ignoring corrupt resume name");
3817 psrch_inf->last_entry = NULL; 3853 psrch_inf->last_entry = NULL;
3818 return rc; 3854 return rc;
3819 } else 3855 } else
3820 psrch_inf->last_entry = 3856 psrch_inf->last_entry =
3821 psrch_inf->srch_entries_start + lnoff; 3857 psrch_inf->srch_entries_start + lnoff;
3822 3858
3823/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3859/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
3824 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3860 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
3825 3861
3826 /* BB fixme add unlock here */ 3862 /* BB fixme add unlock here */
3827 } 3863 }
@@ -3846,7 +3882,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3846 int rc = 0; 3882 int rc = 0;
3847 FINDCLOSE_REQ *pSMB = NULL; 3883 FINDCLOSE_REQ *pSMB = NULL;
3848 3884
3849 cFYI(1, ("In CIFSSMBFindClose")); 3885 cFYI(1, "In CIFSSMBFindClose");
3850 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); 3886 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
3851 3887
3852 /* no sense returning error if session restarted 3888 /* no sense returning error if session restarted
@@ -3860,7 +3896,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3860 pSMB->ByteCount = 0; 3896 pSMB->ByteCount = 0;
3861 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 3897 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
3862 if (rc) 3898 if (rc)
3863 cERROR(1, ("Send error in FindClose = %d", rc)); 3899 cERROR(1, "Send error in FindClose = %d", rc);
3864 3900
3865 cifs_stats_inc(&tcon->num_fclose); 3901 cifs_stats_inc(&tcon->num_fclose);
3866 3902
@@ -3883,7 +3919,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
3883 int name_len, bytes_returned; 3919 int name_len, bytes_returned;
3884 __u16 params, byte_count; 3920 __u16 params, byte_count;
3885 3921
3886 cFYI(1, ("In GetSrvInodeNum for %s", searchName)); 3922 cFYI(1, "In GetSrvInodeNum for %s", searchName);
3887 if (tcon == NULL) 3923 if (tcon == NULL)
3888 return -ENODEV; 3924 return -ENODEV;
3889 3925
@@ -3933,7 +3969,7 @@ GetInodeNumberRetry:
3933 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3969 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3934 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3970 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3935 if (rc) { 3971 if (rc) {
3936 cFYI(1, ("error %d in QueryInternalInfo", rc)); 3972 cFYI(1, "error %d in QueryInternalInfo", rc);
3937 } else { 3973 } else {
3938 /* decode response */ 3974 /* decode response */
3939 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3975 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3948,7 +3984,7 @@ GetInodeNumberRetry:
3948 struct file_internal_info *pfinfo; 3984 struct file_internal_info *pfinfo;
3949 /* BB Do we need a cast or hash here ? */ 3985 /* BB Do we need a cast or hash here ? */
3950 if (count < 8) { 3986 if (count < 8) {
3951 cFYI(1, ("Illegal size ret in QryIntrnlInf")); 3987 cFYI(1, "Illegal size ret in QryIntrnlInf");
3952 rc = -EIO; 3988 rc = -EIO;
3953 goto GetInodeNumOut; 3989 goto GetInodeNumOut;
3954 } 3990 }
@@ -3989,16 +4025,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3989 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); 4025 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
3990 4026
3991 if (*num_of_nodes < 1) { 4027 if (*num_of_nodes < 1) {
3992 cERROR(1, ("num_referrals: must be at least > 0," 4028 cERROR(1, "num_referrals: must be at least > 0,"
3993 "but we get num_referrals = %d\n", *num_of_nodes)); 4029 "but we get num_referrals = %d\n", *num_of_nodes);
3994 rc = -EINVAL; 4030 rc = -EINVAL;
3995 goto parse_DFS_referrals_exit; 4031 goto parse_DFS_referrals_exit;
3996 } 4032 }
3997 4033
3998 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); 4034 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
3999 if (ref->VersionNumber != cpu_to_le16(3)) { 4035 if (ref->VersionNumber != cpu_to_le16(3)) {
4000 cERROR(1, ("Referrals of V%d version are not supported," 4036 cERROR(1, "Referrals of V%d version are not supported,"
4001 "should be V3", le16_to_cpu(ref->VersionNumber))); 4037 "should be V3", le16_to_cpu(ref->VersionNumber));
4002 rc = -EINVAL; 4038 rc = -EINVAL;
4003 goto parse_DFS_referrals_exit; 4039 goto parse_DFS_referrals_exit;
4004 } 4040 }
@@ -4007,14 +4043,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4007 data_end = (char *)(&(pSMBr->PathConsumed)) + 4043 data_end = (char *)(&(pSMBr->PathConsumed)) +
4008 le16_to_cpu(pSMBr->t2.DataCount); 4044 le16_to_cpu(pSMBr->t2.DataCount);
4009 4045
4010 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 4046 cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
4011 *num_of_nodes, 4047 *num_of_nodes,
4012 le32_to_cpu(pSMBr->DFSFlags))); 4048 le32_to_cpu(pSMBr->DFSFlags));
4013 4049
4014 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 4050 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4015 *num_of_nodes, GFP_KERNEL); 4051 *num_of_nodes, GFP_KERNEL);
4016 if (*target_nodes == NULL) { 4052 if (*target_nodes == NULL) {
4017 cERROR(1, ("Failed to allocate buffer for target_nodes\n")); 4053 cERROR(1, "Failed to allocate buffer for target_nodes\n");
4018 rc = -ENOMEM; 4054 rc = -ENOMEM;
4019 goto parse_DFS_referrals_exit; 4055 goto parse_DFS_referrals_exit;
4020 } 4056 }
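parse_DFS_referrals above gates on two conditions before allocating the result array: the referral count must be positive, and the records must be version 3, the only layout the parser understands. A condensed host-order sketch of those checks and the zeroed allocation (dfs_node is a hypothetical stand-in for dfs_info3_param):

#include <errno.h>
#include <stdlib.h>

struct dfs_node { char *node_name; };	/* hypothetical stand-in */

static int check_referrals(int num, int version, struct dfs_node **out)
{
	if (num < 1)
		return -EINVAL;		/* empty referral list */
	if (version != 3)
		return -EINVAL;		/* only V3 records are parsed */
	*out = calloc(num, sizeof(**out));	/* kzalloc analogue */
	return *out ? 0 : -ENOMEM;
}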
@@ -4090,7 +4126,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
4090 *num_of_nodes = 0; 4126 *num_of_nodes = 0;
4091 *target_nodes = NULL; 4127 *target_nodes = NULL;
4092 4128
4093 cFYI(1, ("In GetDFSRefer the path %s", searchName)); 4129 cFYI(1, "In GetDFSRefer the path %s", searchName);
4094 if (ses == NULL) 4130 if (ses == NULL)
4095 return -ENODEV; 4131 return -ENODEV;
4096 getDFSRetry: 4132 getDFSRetry:
@@ -4157,7 +4193,7 @@ getDFSRetry:
4157 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 4193 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
4158 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4194 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4159 if (rc) { 4195 if (rc) {
4160 cFYI(1, ("Send error in GetDFSRefer = %d", rc)); 4196 cFYI(1, "Send error in GetDFSRefer = %d", rc);
4161 goto GetDFSRefExit; 4197 goto GetDFSRefExit;
4162 } 4198 }
4163 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4199 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4168,9 +4204,9 @@ getDFSRetry:
4168 goto GetDFSRefExit; 4204 goto GetDFSRefExit;
4169 } 4205 }
4170 4206
4171 cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", 4207 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
4172 pSMBr->ByteCount, 4208 pSMBr->ByteCount,
4173 le16_to_cpu(pSMBr->t2.DataOffset))); 4209 le16_to_cpu(pSMBr->t2.DataOffset));
4174 4210
4175 /* parse returned result into more usable form */ 4211 /* parse returned result into more usable form */
4176 rc = parse_DFS_referrals(pSMBr, num_of_nodes, 4212 rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4198,7 +4234,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4198 int bytes_returned = 0; 4234 int bytes_returned = 0;
4199 __u16 params, byte_count; 4235 __u16 params, byte_count;
4200 4236
4201 cFYI(1, ("OldQFSInfo")); 4237 cFYI(1, "OldQFSInfo");
4202 oldQFSInfoRetry: 4238 oldQFSInfoRetry:
4203 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4239 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4204 (void **) &pSMBr); 4240 (void **) &pSMBr);
@@ -4231,7 +4267,7 @@ oldQFSInfoRetry:
4231 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4267 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4232 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4268 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4233 if (rc) { 4269 if (rc) {
4234 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4270 cFYI(1, "Send error in QFSInfo = %d", rc);
4235 } else { /* decode response */ 4271 } else { /* decode response */
4236 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4272 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4237 4273
@@ -4239,8 +4275,8 @@ oldQFSInfoRetry:
4239 rc = -EIO; /* bad smb */ 4275 rc = -EIO; /* bad smb */
4240 else { 4276 else {
4241 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4277 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
4242 cFYI(1, ("qfsinf resp BCC: %d Offset %d", 4278 cFYI(1, "qfsinf resp BCC: %d Offset %d",
4243 pSMBr->ByteCount, data_offset)); 4279 pSMBr->ByteCount, data_offset);
4244 4280
4245 response_data = (FILE_SYSTEM_ALLOC_INFO *) 4281 response_data = (FILE_SYSTEM_ALLOC_INFO *)
4246 (((char *) &pSMBr->hdr.Protocol) + data_offset); 4282 (((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4252,11 +4288,10 @@ oldQFSInfoRetry:
4252 le32_to_cpu(response_data->TotalAllocationUnits); 4288 le32_to_cpu(response_data->TotalAllocationUnits);
4253 FSData->f_bfree = FSData->f_bavail = 4289 FSData->f_bfree = FSData->f_bavail =
4254 le32_to_cpu(response_data->FreeAllocationUnits); 4290 le32_to_cpu(response_data->FreeAllocationUnits);
4255 cFYI(1, 4291 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4256 ("Blocks: %lld Free: %lld Block size %ld", 4292 (unsigned long long)FSData->f_blocks,
4257 (unsigned long long)FSData->f_blocks, 4293 (unsigned long long)FSData->f_bfree,
4258 (unsigned long long)FSData->f_bfree, 4294 FSData->f_bsize);
4259 FSData->f_bsize));
4260 } 4295 }
4261 } 4296 }
4262 cifs_buf_release(pSMB); 4297 cifs_buf_release(pSMB);
@@ -4278,7 +4313,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4278 int bytes_returned = 0; 4313 int bytes_returned = 0;
4279 __u16 params, byte_count; 4314 __u16 params, byte_count;
4280 4315
4281 cFYI(1, ("In QFSInfo")); 4316 cFYI(1, "In QFSInfo");
4282 QFSInfoRetry: 4317 QFSInfoRetry:
4283 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4318 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4284 (void **) &pSMBr); 4319 (void **) &pSMBr);
@@ -4311,7 +4346,7 @@ QFSInfoRetry:
4311 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4346 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4312 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4347 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4313 if (rc) { 4348 if (rc) {
4314 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4349 cFYI(1, "Send error in QFSInfo = %d", rc);
4315 } else { /* decode response */ 4350 } else { /* decode response */
4316 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4351 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4317 4352
@@ -4332,11 +4367,10 @@ QFSInfoRetry:
4332 le64_to_cpu(response_data->TotalAllocationUnits); 4367 le64_to_cpu(response_data->TotalAllocationUnits);
4333 FSData->f_bfree = FSData->f_bavail = 4368 FSData->f_bfree = FSData->f_bavail =
4334 le64_to_cpu(response_data->FreeAllocationUnits); 4369 le64_to_cpu(response_data->FreeAllocationUnits);
4335 cFYI(1, 4370 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4336 ("Blocks: %lld Free: %lld Block size %ld", 4371 (unsigned long long)FSData->f_blocks,
4337 (unsigned long long)FSData->f_blocks, 4372 (unsigned long long)FSData->f_bfree,
4338 (unsigned long long)FSData->f_bfree, 4373 FSData->f_bsize);
4339 FSData->f_bsize));
4340 } 4374 }
4341 } 4375 }
4342 cifs_buf_release(pSMB); 4376 cifs_buf_release(pSMB);
@@ -4358,7 +4392,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
4358 int bytes_returned = 0; 4392 int bytes_returned = 0;
4359 __u16 params, byte_count; 4393 __u16 params, byte_count;
4360 4394
4361 cFYI(1, ("In QFSAttributeInfo")); 4395 cFYI(1, "In QFSAttributeInfo");
4362 QFSAttributeRetry: 4396 QFSAttributeRetry:
4363 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4397 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4364 (void **) &pSMBr); 4398 (void **) &pSMBr);
@@ -4392,7 +4426,7 @@ QFSAttributeRetry:
4392 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4426 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4393 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4427 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4394 if (rc) { 4428 if (rc) {
4395 cERROR(1, ("Send error in QFSAttributeInfo = %d", rc)); 4429 cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
4396 } else { /* decode response */ 4430 } else { /* decode response */
4397 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4431 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4398 4432
@@ -4428,7 +4462,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
4428 int bytes_returned = 0; 4462 int bytes_returned = 0;
4429 __u16 params, byte_count; 4463 __u16 params, byte_count;
4430 4464
4431 cFYI(1, ("In QFSDeviceInfo")); 4465 cFYI(1, "In QFSDeviceInfo");
4432 QFSDeviceRetry: 4466 QFSDeviceRetry:
4433 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4467 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4434 (void **) &pSMBr); 4468 (void **) &pSMBr);
@@ -4463,7 +4497,7 @@ QFSDeviceRetry:
4463 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4497 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4464 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4498 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4465 if (rc) { 4499 if (rc) {
4466 cFYI(1, ("Send error in QFSDeviceInfo = %d", rc)); 4500 cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
4467 } else { /* decode response */ 4501 } else { /* decode response */
4468 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4502 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4469 4503
@@ -4498,7 +4532,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4498 int bytes_returned = 0; 4532 int bytes_returned = 0;
4499 __u16 params, byte_count; 4533 __u16 params, byte_count;
4500 4534
4501 cFYI(1, ("In QFSUnixInfo")); 4535 cFYI(1, "In QFSUnixInfo");
4502 QFSUnixRetry: 4536 QFSUnixRetry:
4503 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4504 (void **) &pSMBr); 4538 (void **) &pSMBr);
@@ -4532,7 +4566,7 @@ QFSUnixRetry:
4532 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4566 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4533 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4567 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4534 if (rc) { 4568 if (rc) {
4535 cERROR(1, ("Send error in QFSUnixInfo = %d", rc)); 4569 cERROR(1, "Send error in QFSUnixInfo = %d", rc);
4536 } else { /* decode response */ 4570 } else { /* decode response */
4537 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4571 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4538 4572
@@ -4567,7 +4601,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4567 int bytes_returned = 0; 4601 int bytes_returned = 0;
4568 __u16 params, param_offset, offset, byte_count; 4602 __u16 params, param_offset, offset, byte_count;
4569 4603
4570 cFYI(1, ("In SETFSUnixInfo")); 4604 cFYI(1, "In SETFSUnixInfo");
4571 SETFSUnixRetry: 4605 SETFSUnixRetry:
4572 /* BB switch to small buf init to save memory */ 4606 /* BB switch to small buf init to save memory */
4573 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4615,7 +4649,7 @@ SETFSUnixRetry:
4615 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4649 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4616 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4650 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4617 if (rc) { 4651 if (rc) {
4618 cERROR(1, ("Send error in SETFSUnixInfo = %d", rc)); 4652 cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
4619 } else { /* decode response */ 4653 } else { /* decode response */
4620 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4654 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4621 if (rc) 4655 if (rc)
@@ -4643,7 +4677,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
4643 int bytes_returned = 0; 4677 int bytes_returned = 0;
4644 __u16 params, byte_count; 4678 __u16 params, byte_count;
4645 4679
4646 cFYI(1, ("In QFSPosixInfo")); 4680 cFYI(1, "In QFSPosixInfo");
4647QFSPosixRetry: 4681QFSPosixRetry:
4648 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4682 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4649 (void **) &pSMBr); 4683 (void **) &pSMBr);
@@ -4677,7 +4711,7 @@ QFSPosixRetry:
4677 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4711 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4678 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4712 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4679 if (rc) { 4713 if (rc) {
4680	cFYI(1, ("Send error in QFSPosixInfo = %d", rc)); 4714	cFYI(1, "Send error in QFSPosixInfo = %d", rc);
4681 } else { /* decode response */ 4715 } else { /* decode response */
4682 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4716 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4683 4717
@@ -4737,7 +4771,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
4737 int bytes_returned = 0; 4771 int bytes_returned = 0;
4738 __u16 params, byte_count, data_count, param_offset, offset; 4772 __u16 params, byte_count, data_count, param_offset, offset;
4739 4773
4740 cFYI(1, ("In SetEOF")); 4774 cFYI(1, "In SetEOF");
4741SetEOFRetry: 4775SetEOFRetry:
4742 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4776 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4743 (void **) &pSMBr); 4777 (void **) &pSMBr);
@@ -4803,7 +4837,7 @@ SetEOFRetry:
4803 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4837 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4804 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4838 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4805 if (rc) 4839 if (rc)
4806 cFYI(1, ("SetPathInfo (file size) returned %d", rc)); 4840 cFYI(1, "SetPathInfo (file size) returned %d", rc);
4807 4841
4808 cifs_buf_release(pSMB); 4842 cifs_buf_release(pSMB);
4809 4843
@@ -4823,8 +4857,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4823 int rc = 0; 4857 int rc = 0;
4824 __u16 params, param_offset, offset, byte_count, count; 4858 __u16 params, param_offset, offset, byte_count, count;
4825 4859
4826 cFYI(1, ("SetFileSize (via SetFileInfo) %lld", 4860 cFYI(1, "SetFileSize (via SetFileInfo) %lld",
4827 (long long)size)); 4861 (long long)size);
4828 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4862 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4829 4863
4830 if (rc) 4864 if (rc)
@@ -4883,9 +4917,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4883 pSMB->ByteCount = cpu_to_le16(byte_count); 4917 pSMB->ByteCount = cpu_to_le16(byte_count);
4884 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4918 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4885 if (rc) { 4919 if (rc) {
4886 cFYI(1, 4920 cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
4887 ("Send error in SetFileInfo (SetFileSize) = %d",
4888 rc));
4889 } 4921 }
4890 4922
4891 /* Note: On -EAGAIN error only caller can retry on handle based calls 4923 /* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4909,7 +4941,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4909 int rc = 0; 4941 int rc = 0;
4910 __u16 params, param_offset, offset, byte_count, count; 4942 __u16 params, param_offset, offset, byte_count, count;
4911 4943
4912 cFYI(1, ("Set Times (via SetFileInfo)")); 4944 cFYI(1, "Set Times (via SetFileInfo)");
4913 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4945 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4914 4946
4915 if (rc) 4947 if (rc)
@@ -4954,7 +4986,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4954 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 4986 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
4955 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4987 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4956 if (rc) 4988 if (rc)
4957 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 4989 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
4958 4990
4959 /* Note: On -EAGAIN error only caller can retry on handle based calls 4991 /* Note: On -EAGAIN error only caller can retry on handle based calls
4960 since file handle passed in no longer valid */ 4992 since file handle passed in no longer valid */
@@ -4971,7 +5003,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4971 int rc = 0; 5003 int rc = 0;
4972 __u16 params, param_offset, offset, byte_count, count; 5004 __u16 params, param_offset, offset, byte_count, count;
4973 5005
4974 cFYI(1, ("Set File Disposition (via SetFileInfo)")); 5006 cFYI(1, "Set File Disposition (via SetFileInfo)");
4975 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5007 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4976 5008
4977 if (rc) 5009 if (rc)
@@ -5013,7 +5045,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5013 *data_offset = delete_file ? 1 : 0; 5045 *data_offset = delete_file ? 1 : 0;
5014 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5046 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5015 if (rc) 5047 if (rc)
5016 cFYI(1, ("Send error in SetFileDisposition = %d", rc)); 5048 cFYI(1, "Send error in SetFileDisposition = %d", rc);
5017 5049
5018 return rc; 5050 return rc;
5019} 5051}
@@ -5031,7 +5063,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
5031 char *data_offset; 5063 char *data_offset;
5032 __u16 params, param_offset, offset, byte_count, count; 5064 __u16 params, param_offset, offset, byte_count, count;
5033 5065
5034 cFYI(1, ("In SetTimes")); 5066 cFYI(1, "In SetTimes");
5035 5067
5036SetTimesRetry: 5068SetTimesRetry:
5037 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5069 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5087,7 +5119,7 @@ SetTimesRetry:
5087 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5119 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5088 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5120 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5089 if (rc) 5121 if (rc)
5090 cFYI(1, ("SetPathInfo (times) returned %d", rc)); 5122 cFYI(1, "SetPathInfo (times) returned %d", rc);
5091 5123
5092 cifs_buf_release(pSMB); 5124 cifs_buf_release(pSMB);
5093 5125
@@ -5112,7 +5144,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
5112 int bytes_returned; 5144 int bytes_returned;
5113 int name_len; 5145 int name_len;
5114 5146
5115 cFYI(1, ("In SetAttrLegacy")); 5147 cFYI(1, "In SetAttrLegacy");
5116 5148
5117SetAttrLgcyRetry: 5149SetAttrLgcyRetry:
5118 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, 5150 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5138,7 +5170,7 @@ SetAttrLgcyRetry:
5138 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5170 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5139 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5171 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5140 if (rc) 5172 if (rc)
5141 cFYI(1, ("Error in LegacySetAttr = %d", rc)); 5173 cFYI(1, "Error in LegacySetAttr = %d", rc);
5142 5174
5143 cifs_buf_release(pSMB); 5175 cifs_buf_release(pSMB);
5144 5176
@@ -5200,7 +5232,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5200 int rc = 0; 5232 int rc = 0;
5201 u16 params, param_offset, offset, byte_count, count; 5233 u16 params, param_offset, offset, byte_count, count;
5202 5234
5203 cFYI(1, ("Set Unix Info (via SetFileInfo)")); 5235 cFYI(1, "Set Unix Info (via SetFileInfo)");
5204 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5236 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5205 5237
5206 if (rc) 5238 if (rc)
@@ -5245,7 +5277,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5245 5277
5246 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5278 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5247 if (rc) 5279 if (rc)
5248	cFYI(1, ("Send error in Set Unix Info (SetFileInfo) = %d", rc)); 5280	cFYI(1, "Send error in Set Unix Info (SetFileInfo) = %d", rc);
5249 5281
5250 /* Note: On -EAGAIN error only caller can retry on handle based calls 5282 /* Note: On -EAGAIN error only caller can retry on handle based calls
5251 since file handle passed in no longer valid */ 5283 since file handle passed in no longer valid */
@@ -5266,7 +5298,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
5266 FILE_UNIX_BASIC_INFO *data_offset; 5298 FILE_UNIX_BASIC_INFO *data_offset;
5267 __u16 params, param_offset, offset, count, byte_count; 5299 __u16 params, param_offset, offset, count, byte_count;
5268 5300
5269 cFYI(1, ("In SetUID/GID/Mode")); 5301 cFYI(1, "In SetUID/GID/Mode");
5270setPermsRetry: 5302setPermsRetry:
5271 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5303 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5272 (void **) &pSMBr); 5304 (void **) &pSMBr);
@@ -5322,7 +5354,7 @@ setPermsRetry:
5322 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5354 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5323 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5324 if (rc) 5356 if (rc)
5325 cFYI(1, ("SetPathInfo (perms) returned %d", rc)); 5357 cFYI(1, "SetPathInfo (perms) returned %d", rc);
5326 5358
5327 cifs_buf_release(pSMB); 5359 cifs_buf_release(pSMB);
5328 if (rc == -EAGAIN) 5360 if (rc == -EAGAIN)
@@ -5341,7 +5373,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5341 struct dir_notify_req *dnotify_req; 5373 struct dir_notify_req *dnotify_req;
5342 int bytes_returned; 5374 int bytes_returned;
5343 5375
5344 cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid)); 5376 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
5345 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 5377 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
5346 (void **) &pSMBr); 5378 (void **) &pSMBr);
5347 if (rc) 5379 if (rc)
@@ -5375,7 +5407,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5375 (struct smb_hdr *)pSMBr, &bytes_returned, 5407 (struct smb_hdr *)pSMBr, &bytes_returned,
5376 CIFS_ASYNC_OP); 5408 CIFS_ASYNC_OP);
5377 if (rc) { 5409 if (rc) {
5378 cFYI(1, ("Error in Notify = %d", rc)); 5410 cFYI(1, "Error in Notify = %d", rc);
5379 } else { 5411 } else {
5380 /* Add file to outstanding requests */ 5412 /* Add file to outstanding requests */
5381 /* BB change to kmem cache alloc */ 5413 /* BB change to kmem cache alloc */
@@ -5431,7 +5463,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5431 char *end_of_smb; 5463 char *end_of_smb;
5432 __u16 params, byte_count, data_offset; 5464 __u16 params, byte_count, data_offset;
5433 5465
5434 cFYI(1, ("In Query All EAs path %s", searchName)); 5466 cFYI(1, "In Query All EAs path %s", searchName);
5435QAllEAsRetry: 5467QAllEAsRetry:
5436 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5468 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5437 (void **) &pSMBr); 5469 (void **) &pSMBr);
@@ -5478,7 +5510,7 @@ QAllEAsRetry:
5478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5510 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5479 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5511 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5480 if (rc) { 5512 if (rc) {
5481 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5513 cFYI(1, "Send error in QueryAllEAs = %d", rc);
5482 goto QAllEAsOut; 5514 goto QAllEAsOut;
5483 } 5515 }
5484 5516
@@ -5506,16 +5538,16 @@ QAllEAsRetry:
5506 (((char *) &pSMBr->hdr.Protocol) + data_offset); 5538 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5507 5539
5508 list_len = le32_to_cpu(ea_response_data->list_len); 5540 list_len = le32_to_cpu(ea_response_data->list_len);
5509 cFYI(1, ("ea length %d", list_len)); 5541 cFYI(1, "ea length %d", list_len);
5510 if (list_len <= 8) { 5542 if (list_len <= 8) {
5511 cFYI(1, ("empty EA list returned from server")); 5543 cFYI(1, "empty EA list returned from server");
5512 goto QAllEAsOut; 5544 goto QAllEAsOut;
5513 } 5545 }
5514 5546
5515 /* make sure list_len doesn't go past end of SMB */ 5547 /* make sure list_len doesn't go past end of SMB */
5516 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5548 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5517 if ((char *)ea_response_data + list_len > end_of_smb) { 5549 if ((char *)ea_response_data + list_len > end_of_smb) {
5518 cFYI(1, ("EA list appears to go beyond SMB")); 5550 cFYI(1, "EA list appears to go beyond SMB");
5519 rc = -EIO; 5551 rc = -EIO;
5520 goto QAllEAsOut; 5552 goto QAllEAsOut;
5521 } 5553 }
@@ -5532,7 +5564,7 @@ QAllEAsRetry:
5532 temp_ptr += 4; 5564 temp_ptr += 4;
5533 /* make sure we can read name_len and value_len */ 5565 /* make sure we can read name_len and value_len */
5534 if (list_len < 0) { 5566 if (list_len < 0) {
5535 cFYI(1, ("EA entry goes beyond length of list")); 5567 cFYI(1, "EA entry goes beyond length of list");
5536 rc = -EIO; 5568 rc = -EIO;
5537 goto QAllEAsOut; 5569 goto QAllEAsOut;
5538 } 5570 }
@@ -5541,7 +5573,7 @@ QAllEAsRetry:
5541 value_len = le16_to_cpu(temp_fea->value_len); 5573 value_len = le16_to_cpu(temp_fea->value_len);
5542 list_len -= name_len + 1 + value_len; 5574 list_len -= name_len + 1 + value_len;
5543 if (list_len < 0) { 5575 if (list_len < 0) {
5544 cFYI(1, ("EA entry goes beyond length of list")); 5576 cFYI(1, "EA entry goes beyond length of list");
5545 rc = -EIO; 5577 rc = -EIO;
5546 goto QAllEAsOut; 5578 goto QAllEAsOut;
5547 } 5579 }
@@ -5608,7 +5640,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
5608 int bytes_returned = 0; 5640 int bytes_returned = 0;
5609 __u16 params, param_offset, byte_count, offset, count; 5641 __u16 params, param_offset, byte_count, offset, count;
5610 5642
5611 cFYI(1, ("In SetEA")); 5643 cFYI(1, "In SetEA");
5612SetEARetry: 5644SetEARetry:
5613 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5645 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5614 (void **) &pSMBr); 5646 (void **) &pSMBr);
@@ -5690,7 +5722,7 @@ SetEARetry:
5690 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5722 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5691 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5723 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5692 if (rc) 5724 if (rc)
5693 cFYI(1, ("SetPathInfo (EA) returned %d", rc)); 5725 cFYI(1, "SetPathInfo (EA) returned %d", rc);
5694 5726
5695 cifs_buf_release(pSMB); 5727 cifs_buf_release(pSMB);
5696 5728
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..2208f06e4c45 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
@@ -101,6 +102,7 @@ struct smb_vol {
101 bool sockopt_tcp_nodelay:1; 102 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 103 unsigned short int port;
103 char *prepath; 104 char *prepath;
105 struct nls_table *local_nls;
104}; 106};
105 107
106static int ipv4_connect(struct TCP_Server_Info *server); 108static int ipv4_connect(struct TCP_Server_Info *server);
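
Besides the slab.h include, this hunk grows struct smb_vol by a local_nls pointer, so the parsed mount options can carry the NLS codepage down to the new cifs_get_smb_ses()/cifs_get_tcon() helpers introduced further below. A hypothetical sketch of how the mount path would populate it, using the standard kernel NLS API (the error value here is an assumption):

/* sketch: resolve the iocharset= mount option into a codepage table */
if (volume_info->iocharset == NULL) {
	volume_info->local_nls = load_nls_default();
} else {
	volume_info->local_nls = load_nls(volume_info->iocharset);
	if (volume_info->local_nls == NULL) {
		cERROR(1, "CIFS mount error: iocharset %s not found",
			  volume_info->iocharset);
		return -ELIBACC;	/* assumed errno for a missing codepage */
	}
}
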
@@ -134,7 +136,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
134 spin_unlock(&GlobalMid_Lock); 136 spin_unlock(&GlobalMid_Lock);
135 server->maxBuf = 0; 137 server->maxBuf = 0;
136 138
137 cFYI(1, ("Reconnecting tcp session")); 139 cFYI(1, "Reconnecting tcp session");
138 140
139 /* before reconnecting the tcp session, mark the smb session (uid) 141 /* before reconnecting the tcp session, mark the smb session (uid)
140 and the tid bad so they are not used until reconnected */ 142 and the tid bad so they are not used until reconnected */
@@ -152,12 +154,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
152 /* do not want to be sending data on a socket we are freeing */ 154 /* do not want to be sending data on a socket we are freeing */
153 mutex_lock(&server->srv_mutex); 155 mutex_lock(&server->srv_mutex);
154 if (server->ssocket) { 156 if (server->ssocket) {
155 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state, 157 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
156 server->ssocket->flags)); 158 server->ssocket->flags);
157 kernel_sock_shutdown(server->ssocket, SHUT_WR); 159 kernel_sock_shutdown(server->ssocket, SHUT_WR);
158 cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx", 160 cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
159 server->ssocket->state, 161 server->ssocket->state,
160 server->ssocket->flags)); 162 server->ssocket->flags);
161 sock_release(server->ssocket); 163 sock_release(server->ssocket);
162 server->ssocket = NULL; 164 server->ssocket = NULL;
163 } 165 }
@@ -186,7 +188,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
186 else 188 else
187 rc = ipv4_connect(server); 189 rc = ipv4_connect(server);
188 if (rc) { 190 if (rc) {
189 cFYI(1, ("reconnect error %d", rc)); 191 cFYI(1, "reconnect error %d", rc);
190 msleep(3000); 192 msleep(3000);
191 } else { 193 } else {
192 atomic_inc(&tcpSesReconnectCount); 194 atomic_inc(&tcpSesReconnectCount);
@@ -222,7 +224,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
222 /* check for plausible wct, bcc and t2 data and parm sizes */ 224 /* check for plausible wct, bcc and t2 data and parm sizes */
223 /* check for parm and data offset going beyond end of smb */ 225 /* check for parm and data offset going beyond end of smb */
224 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ 226 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
225 cFYI(1, ("invalid transact2 word count")); 227 cFYI(1, "invalid transact2 word count");
226 return -EINVAL; 228 return -EINVAL;
227 } 229 }
228 230
@@ -236,15 +238,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
236 if (remaining == 0) 238 if (remaining == 0)
237 return 0; 239 return 0;
238 else if (remaining < 0) { 240 else if (remaining < 0) {
239 cFYI(1, ("total data %d smaller than data in frame %d", 241 cFYI(1, "total data %d smaller than data in frame %d",
240 total_data_size, data_in_this_rsp)); 242 total_data_size, data_in_this_rsp);
241 return -EINVAL; 243 return -EINVAL;
242 } else { 244 } else {
243 cFYI(1, ("missing %d bytes from transact2, check next response", 245 cFYI(1, "missing %d bytes from transact2, check next response",
244 remaining)); 246 remaining);
245 if (total_data_size > maxBufSize) { 247 if (total_data_size > maxBufSize) {
246 cERROR(1, ("TotalDataSize %d is over maximum buffer %d", 248 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
247 total_data_size, maxBufSize)); 249 total_data_size, maxBufSize);
248 return -EINVAL; 250 return -EINVAL;
249 } 251 }
250 return remaining; 252 return remaining;
@@ -266,7 +268,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
266 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 268 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
267 269
268 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 270 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
269 cFYI(1, ("total data size of primary and secondary t2 differ")); 271 cFYI(1, "total data size of primary and secondary t2 differ");
270 } 272 }
271 273
272 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 274 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -281,7 +283,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
281 283
282 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 284 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
283 if (remaining < total_in_buf2) { 285 if (remaining < total_in_buf2) {
284 cFYI(1, ("transact2 2nd response contains too much data")); 286 cFYI(1, "transact2 2nd response contains too much data");
285 } 287 }
286 288
287 /* find end of first SMB data area */ 289 /* find end of first SMB data area */
@@ -310,7 +312,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
310 pTargetSMB->smb_buf_length = byte_count; 312 pTargetSMB->smb_buf_length = byte_count;
311 313
312 if (remaining == total_in_buf2) { 314 if (remaining == total_in_buf2) {
313 cFYI(1, ("found the last secondary response")); 315 cFYI(1, "found the last secondary response");
314 return 0; /* we are done */ 316 return 0; /* we are done */
315 } else /* more responses to go */ 317 } else /* more responses to go */
316 return 1; 318 return 1;
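
For orientation: the two functions touched above reassemble multi-part Transaction2 responses. check2ndT2() sanity-checks a secondary response and reports how many bytes are still outstanding, while coalesce_t2() folds the secondary data into the first buffer, returning 0 once the last piece has arrived and 1 while more responses are expected. A hedged sketch of that 0/1 convention from the caller's side; receive_next_t2() stands in for the real socket-read path in the demultiplex thread and is not an actual function:

int rc = 1;

while (rc == 1) {		/* 1 == more secondary responses due */
	struct smb_hdr *psecond = receive_next_t2(server);	/* assumed */

	if (psecond == NULL)
		return -EIO;	/* read failed mid-reassembly */
	rc = coalesce_t2(psecond, pTargetSMB);
}
return rc;	/* 0: pTargetSMB now holds the complete transact2 response */
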
@@ -338,7 +340,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
338 int reconnect; 340 int reconnect;
339 341
340 current->flags |= PF_MEMALLOC; 342 current->flags |= PF_MEMALLOC;
341 cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); 343 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
342 344
343 length = atomic_inc_return(&tcpSesAllocCount); 345 length = atomic_inc_return(&tcpSesAllocCount);
344 if (length > 1) 346 if (length > 1)
@@ -352,7 +354,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
352 if (bigbuf == NULL) { 354 if (bigbuf == NULL) {
353 bigbuf = cifs_buf_get(); 355 bigbuf = cifs_buf_get();
354 if (!bigbuf) { 356 if (!bigbuf) {
355 cERROR(1, ("No memory for large SMB response")); 357 cERROR(1, "No memory for large SMB response");
356 msleep(3000); 358 msleep(3000);
357 /* retry will check if exiting */ 359 /* retry will check if exiting */
358 continue; 360 continue;
@@ -365,7 +367,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
365 if (smallbuf == NULL) { 367 if (smallbuf == NULL) {
366 smallbuf = cifs_small_buf_get(); 368 smallbuf = cifs_small_buf_get();
367 if (!smallbuf) { 369 if (!smallbuf) {
368 cERROR(1, ("No memory for SMB response")); 370 cERROR(1, "No memory for SMB response");
369 msleep(1000); 371 msleep(1000);
370 /* retry will check if exiting */ 372 /* retry will check if exiting */
371 continue; 373 continue;
@@ -390,9 +392,9 @@ incomplete_rcv:
390 if (server->tcpStatus == CifsExiting) { 392 if (server->tcpStatus == CifsExiting) {
391 break; 393 break;
392 } else if (server->tcpStatus == CifsNeedReconnect) { 394 } else if (server->tcpStatus == CifsNeedReconnect) {
393 cFYI(1, ("Reconnect after server stopped responding")); 395 cFYI(1, "Reconnect after server stopped responding");
394 cifs_reconnect(server); 396 cifs_reconnect(server);
395 cFYI(1, ("call to reconnect done")); 397 cFYI(1, "call to reconnect done");
396 csocket = server->ssocket; 398 csocket = server->ssocket;
397 continue; 399 continue;
398 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 400 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -410,7 +412,7 @@ incomplete_rcv:
410 continue; 412 continue;
411 } else if (length <= 0) { 413 } else if (length <= 0) {
412 if (server->tcpStatus == CifsNew) { 414 if (server->tcpStatus == CifsNew) {
413 cFYI(1, ("tcp session abend after SMBnegprot")); 415 cFYI(1, "tcp session abend after SMBnegprot");
414 /* some servers kill the TCP session rather than 416 /* some servers kill the TCP session rather than
415 returning an SMB negprot error, in which 417 returning an SMB negprot error, in which
416 case reconnecting here is not going to help, 418 case reconnecting here is not going to help,
@@ -418,18 +420,18 @@ incomplete_rcv:
418 break; 420 break;
419 } 421 }
420 if (!try_to_freeze() && (length == -EINTR)) { 422 if (!try_to_freeze() && (length == -EINTR)) {
421 cFYI(1, ("cifsd thread killed")); 423 cFYI(1, "cifsd thread killed");
422 break; 424 break;
423 } 425 }
424 cFYI(1, ("Reconnect after unexpected peek error %d", 426 cFYI(1, "Reconnect after unexpected peek error %d",
425 length)); 427 length);
426 cifs_reconnect(server); 428 cifs_reconnect(server);
427 csocket = server->ssocket; 429 csocket = server->ssocket;
428 wake_up(&server->response_q); 430 wake_up(&server->response_q);
429 continue; 431 continue;
430 } else if (length < pdu_length) { 432 } else if (length < pdu_length) {
431 cFYI(1, ("requested %d bytes but only got %d bytes", 433 cFYI(1, "requested %d bytes but only got %d bytes",
432 pdu_length, length)); 434 pdu_length, length);
433 pdu_length -= length; 435 pdu_length -= length;
434 msleep(1); 436 msleep(1);
435 goto incomplete_rcv; 437 goto incomplete_rcv;
@@ -449,18 +451,18 @@ incomplete_rcv:
449 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); 451 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
450 smb_buffer->smb_buf_length = pdu_length; 452 smb_buffer->smb_buf_length = pdu_length;
451 453
452 cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); 454 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
453 455
454 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { 456 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
455 continue; 457 continue;
456 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { 458 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
457 cFYI(1, ("Good RFC 1002 session rsp")); 459 cFYI(1, "Good RFC 1002 session rsp");
458 continue; 460 continue;
459 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { 461 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
460 /* we get this from Windows 98 instead of 462 /* we get this from Windows 98 instead of
461 an error on SMB negprot response */ 463 an error on SMB negprot response */
462 cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)", 464 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
463 pdu_length)); 465 pdu_length);
464 if (server->tcpStatus == CifsNew) { 466 if (server->tcpStatus == CifsNew) {
465 /* if nack on negprot (rather than 467 /* if nack on negprot (rather than
466 ret of smb negprot error) reconnecting 468 ret of smb negprot error) reconnecting
@@ -483,7 +485,7 @@ incomplete_rcv:
483 continue; 485 continue;
484 } 486 }
485 } else if (temp != (char) 0) { 487 } else if (temp != (char) 0) {
486 cERROR(1, ("Unknown RFC 1002 frame")); 488 cERROR(1, "Unknown RFC 1002 frame");
487 cifs_dump_mem(" Received Data: ", (char *)smb_buffer, 489 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
488 length); 490 length);
489 cifs_reconnect(server); 491 cifs_reconnect(server);
@@ -494,8 +496,8 @@ incomplete_rcv:
494 /* else we have an SMB response */ 496 /* else we have an SMB response */
495 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || 497 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
496 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { 498 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
497 cERROR(1, ("Invalid size SMB length %d pdu_length %d", 499 cERROR(1, "Invalid size SMB length %d pdu_length %d",
498 length, pdu_length+4)); 500 length, pdu_length+4);
499 cifs_reconnect(server); 501 cifs_reconnect(server);
500 csocket = server->ssocket; 502 csocket = server->ssocket;
501 wake_up(&server->response_q); 503 wake_up(&server->response_q);
@@ -538,8 +540,8 @@ incomplete_rcv:
538 length = 0; 540 length = 0;
539 continue; 541 continue;
540 } else if (length <= 0) { 542 } else if (length <= 0) {
541 cERROR(1, ("Received no data, expecting %d", 543 cERROR(1, "Received no data, expecting %d",
542 pdu_length - total_read)); 544 pdu_length - total_read);
543 cifs_reconnect(server); 545 cifs_reconnect(server);
544 csocket = server->ssocket; 546 csocket = server->ssocket;
545 reconnect = 1; 547 reconnect = 1;
@@ -587,7 +589,7 @@ incomplete_rcv:
587 } 589 }
588 } else { 590 } else {
589 if (!isLargeBuf) { 591 if (!isLargeBuf) {
590 cERROR(1,("1st trans2 resp needs bigbuf")); 592 cERROR(1, "1st trans2 resp needs bigbuf");
591 /* BB maybe we can fix this up, switch 593 /* BB maybe we can fix this up, switch
592 to already allocated large buffer? */ 594 to already allocated large buffer? */
593 } else { 595 } else {
@@ -629,8 +631,8 @@ multi_t2_fnd:
629 wake_up_process(task_to_wake); 631 wake_up_process(task_to_wake);
630 } else if (!is_valid_oplock_break(smb_buffer, server) && 632 } else if (!is_valid_oplock_break(smb_buffer, server) &&
631 !isMultiRsp) { 633 !isMultiRsp) {
632 cERROR(1, ("No task to wake, unknown frame received! " 634 cERROR(1, "No task to wake, unknown frame received! "
633 "NumMids %d", midCount.counter)); 635 "NumMids %d", midCount.counter);
634 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 636 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
635 sizeof(struct smb_hdr)); 637 sizeof(struct smb_hdr));
636#ifdef CONFIG_CIFS_DEBUG2 638#ifdef CONFIG_CIFS_DEBUG2
@@ -707,8 +709,8 @@ multi_t2_fnd:
707 list_for_each(tmp, &server->pending_mid_q) { 709 list_for_each(tmp, &server->pending_mid_q) {
708 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 710 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
709 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 711 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
710 cFYI(1, ("Clearing Mid 0x%x - waking up ", 712 cFYI(1, "Clearing Mid 0x%x - waking up ",
711 mid_entry->mid)); 713 mid_entry->mid);
712 task_to_wake = mid_entry->tsk; 714 task_to_wake = mid_entry->tsk;
713 if (task_to_wake) 715 if (task_to_wake)
714 wake_up_process(task_to_wake); 716 wake_up_process(task_to_wake);
@@ -727,7 +729,7 @@ multi_t2_fnd:
727 to wait at least 45 seconds before giving up 729 to wait at least 45 seconds before giving up
728 on a request getting a response and going ahead 730 on a request getting a response and going ahead
729 and killing cifsd */ 731 and killing cifsd */
730 cFYI(1, ("Wait for exit from demultiplex thread")); 732 cFYI(1, "Wait for exit from demultiplex thread");
731 msleep(46000); 733 msleep(46000);
732 /* if threads still have not exited they are probably never 734 /* if threads still have not exited they are probably never
733 coming home not much else we can do but free the memory */ 735 coming home not much else we can do but free the memory */
@@ -848,7 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
848 separator[0] = options[4]; 850 separator[0] = options[4];
849 options += 5; 851 options += 5;
850 } else { 852 } else {
851 cFYI(1, ("Null separator not allowed")); 853 cFYI(1, "Null separator not allowed");
852 } 854 }
853 } 855 }
854 856
@@ -973,7 +975,7 @@ cifs_parse_mount_options(char *options, const char *devname,
973 } 975 }
974 } else if (strnicmp(data, "sec", 3) == 0) { 976 } else if (strnicmp(data, "sec", 3) == 0) {
975 if (!value || !*value) { 977 if (!value || !*value) {
976 cERROR(1, ("no security value specified")); 978 cERROR(1, "no security value specified");
977 continue; 979 continue;
978 } else if (strnicmp(value, "krb5i", 5) == 0) { 980 } else if (strnicmp(value, "krb5i", 5) == 0) {
979 vol->secFlg |= CIFSSEC_MAY_KRB5 | 981 vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -981,7 +983,7 @@ cifs_parse_mount_options(char *options, const char *devname,
981 } else if (strnicmp(value, "krb5p", 5) == 0) { 983 } else if (strnicmp(value, "krb5p", 5) == 0) {
982 /* vol->secFlg |= CIFSSEC_MUST_SEAL | 984 /* vol->secFlg |= CIFSSEC_MUST_SEAL |
983 CIFSSEC_MAY_KRB5; */ 985 CIFSSEC_MAY_KRB5; */
984 cERROR(1, ("Krb5 cifs privacy not supported")); 986 cERROR(1, "Krb5 cifs privacy not supported");
985 return 1; 987 return 1;
986 } else if (strnicmp(value, "krb5", 4) == 0) { 988 } else if (strnicmp(value, "krb5", 4) == 0) {
987 vol->secFlg |= CIFSSEC_MAY_KRB5; 989 vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1013,7 +1015,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1013 } else if (strnicmp(value, "none", 4) == 0) { 1015 } else if (strnicmp(value, "none", 4) == 0) {
1014 vol->nullauth = 1; 1016 vol->nullauth = 1;
1015 } else { 1017 } else {
1016 cERROR(1, ("bad security option: %s", value)); 1018 cERROR(1, "bad security option: %s", value);
1017 return 1; 1019 return 1;
1018 } 1020 }
1019 } else if ((strnicmp(data, "unc", 3) == 0) 1021 } else if ((strnicmp(data, "unc", 3) == 0)
@@ -1052,7 +1054,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1052 a domain name and need special handling? */ 1054 a domain name and need special handling? */
1053 if (strnlen(value, 256) < 256) { 1055 if (strnlen(value, 256) < 256) {
1054 vol->domainname = value; 1056 vol->domainname = value;
1055 cFYI(1, ("Domain name set")); 1057 cFYI(1, "Domain name set");
1056 } else { 1058 } else {
1057 printk(KERN_WARNING "CIFS: domain name too " 1059 printk(KERN_WARNING "CIFS: domain name too "
1058 "long\n"); 1060 "long\n");
@@ -1075,7 +1077,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1075 strcpy(vol->prepath+1, value); 1077 strcpy(vol->prepath+1, value);
1076 } else 1078 } else
1077 strcpy(vol->prepath, value); 1079 strcpy(vol->prepath, value);
1078 cFYI(1, ("prefix path %s", vol->prepath)); 1080 cFYI(1, "prefix path %s", vol->prepath);
1079 } else { 1081 } else {
1080 printk(KERN_WARNING "CIFS: prefix too long\n"); 1082 printk(KERN_WARNING "CIFS: prefix too long\n");
1081 return 1; 1083 return 1;
@@ -1091,7 +1093,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1091 vol->iocharset = value; 1093 vol->iocharset = value;
1092 /* if iocharset not set then load_nls_default 1094 /* if iocharset not set then load_nls_default
1093 is used by caller */ 1095 is used by caller */
1094 cFYI(1, ("iocharset set to %s", value)); 1096 cFYI(1, "iocharset set to %s", value);
1095 } else { 1097 } else {
1096 printk(KERN_WARNING "CIFS: iocharset name " 1098 printk(KERN_WARNING "CIFS: iocharset name "
1097 "too long.\n"); 1099 "too long.\n");
@@ -1143,14 +1145,14 @@ cifs_parse_mount_options(char *options, const char *devname,
1143 } 1145 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1146 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (!value || !*value) { 1147 if (!value || !*value) {
1146 cERROR(1, ("no socket option specified")); 1148 cERROR(1, "no socket option specified");
1147 continue; 1149 continue;
1148 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { 1150 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1149 vol->sockopt_tcp_nodelay = 1; 1151 vol->sockopt_tcp_nodelay = 1;
1150 } 1152 }
1151 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1153 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1152 if (!value || !*value || (*value == ' ')) { 1154 if (!value || !*value || (*value == ' ')) {
1153 cFYI(1, ("invalid (empty) netbiosname")); 1155 cFYI(1, "invalid (empty) netbiosname");
1154 } else { 1156 } else {
1155 memset(vol->source_rfc1001_name, 0x20, 15); 1157 memset(vol->source_rfc1001_name, 0x20, 15);
1156 for (i = 0; i < 15; i++) { 1158 for (i = 0; i < 15; i++) {
@@ -1174,7 +1176,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1174 } else if (strnicmp(data, "servern", 7) == 0) { 1176 } else if (strnicmp(data, "servern", 7) == 0) {
1175 /* servernetbiosname specified override *SMBSERVER */ 1177 /* servernetbiosname specified override *SMBSERVER */
1176 if (!value || !*value || (*value == ' ')) { 1178 if (!value || !*value || (*value == ' ')) {
1177 cFYI(1, ("empty server netbiosname specified")); 1179 cFYI(1, "empty server netbiosname specified");
1178 } else { 1180 } else {
1179	/* last byte, type, is 0x20 for server type */ 1181	/* last byte, type, is 0x20 for server type */
1180 memset(vol->target_rfc1001_name, 0x20, 16); 1182 memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1433,7 +1435,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
1433 1435
1434 ++server->srv_count; 1436 ++server->srv_count;
1435 write_unlock(&cifs_tcp_ses_lock); 1437 write_unlock(&cifs_tcp_ses_lock);
1436 cFYI(1, ("Existing tcp session with server found")); 1438 cFYI(1, "Existing tcp session with server found");
1437 return server; 1439 return server;
1438 } 1440 }
1439 write_unlock(&cifs_tcp_ses_lock); 1441 write_unlock(&cifs_tcp_ses_lock);
@@ -1474,7 +1476,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1474 1476
1475 memset(&addr, 0, sizeof(struct sockaddr_storage)); 1477 memset(&addr, 0, sizeof(struct sockaddr_storage));
1476 1478
1477 cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip)); 1479 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
1478 1480
1479 if (volume_info->UNCip && volume_info->UNC) { 1481 if (volume_info->UNCip && volume_info->UNC) {
1480 rc = cifs_convert_address(volume_info->UNCip, &addr); 1482 rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1486,13 +1488,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1486 } else if (volume_info->UNCip) { 1488 } else if (volume_info->UNCip) {
1487 /* BB using ip addr as tcp_ses name to connect to the 1489 /* BB using ip addr as tcp_ses name to connect to the
1488 DFS root below */ 1490 DFS root below */
1489 cERROR(1, ("Connecting to DFS root not implemented yet")); 1491 cERROR(1, "Connecting to DFS root not implemented yet");
1490 rc = -EINVAL; 1492 rc = -EINVAL;
1491 goto out_err; 1493 goto out_err;
1492	} else /* which tcp_sess DFS root would we connect to */ { 1494	} else /* which tcp_sess DFS root would we connect to */ {
1493 cERROR(1, 1495 cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
1494 ("CIFS mount error: No UNC path (e.g. -o " 1496 "unc=//192.168.1.100/public) specified");
1495 "unc=//192.168.1.100/public) specified"));
1496 rc = -EINVAL; 1497 rc = -EINVAL;
1497 goto out_err; 1498 goto out_err;
1498 } 1499 }
@@ -1539,7 +1540,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1539 ++tcp_ses->srv_count; 1540 ++tcp_ses->srv_count;
1540 1541
1541 if (addr.ss_family == AF_INET6) { 1542 if (addr.ss_family == AF_INET6) {
1542 cFYI(1, ("attempting ipv6 connect")); 1543 cFYI(1, "attempting ipv6 connect");
1543 /* BB should we allow ipv6 on port 139? */ 1544 /* BB should we allow ipv6 on port 139? */
1544 /* other OS never observed in Wild doing 139 with v6 */ 1545 /* other OS never observed in Wild doing 139 with v6 */
1545 sin_server6->sin6_port = htons(volume_info->port); 1546 sin_server6->sin6_port = htons(volume_info->port);
@@ -1553,7 +1554,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1553 rc = ipv4_connect(tcp_ses); 1554 rc = ipv4_connect(tcp_ses);
1554 } 1555 }
1555 if (rc < 0) { 1556 if (rc < 0) {
1556 cERROR(1, ("Error connecting to socket. Aborting operation")); 1557 cERROR(1, "Error connecting to socket. Aborting operation");
1557 goto out_err; 1558 goto out_err;
1558 } 1559 }
1559 1560
@@ -1566,7 +1567,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1566 tcp_ses, "cifsd"); 1567 tcp_ses, "cifsd");
1567 if (IS_ERR(tcp_ses->tsk)) { 1568 if (IS_ERR(tcp_ses->tsk)) {
1568 rc = PTR_ERR(tcp_ses->tsk); 1569 rc = PTR_ERR(tcp_ses->tsk);
1569 cERROR(1, ("error %d create cifsd thread", rc)); 1570 cERROR(1, "error %d create cifsd thread", rc);
1570 module_put(THIS_MODULE); 1571 module_put(THIS_MODULE);
1571 goto out_err; 1572 goto out_err;
1572 } 1573 }
@@ -1615,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1615 int xid; 1616 int xid;
1616 struct TCP_Server_Info *server = ses->server; 1617 struct TCP_Server_Info *server = ses->server;
1617 1618
1619 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1618 write_lock(&cifs_tcp_ses_lock); 1620 write_lock(&cifs_tcp_ses_lock);
1619 if (--ses->ses_count > 0) { 1621 if (--ses->ses_count > 0) {
1620 write_unlock(&cifs_tcp_ses_lock); 1622 write_unlock(&cifs_tcp_ses_lock);
@@ -1633,6 +1635,102 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1633 cifs_put_tcp_session(server); 1635 cifs_put_tcp_session(server);
1634} 1636}
1635 1637
1638static struct cifsSesInfo *
1639cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1640{
1641 int rc = -ENOMEM, xid;
1642 struct cifsSesInfo *ses;
1643
1644 xid = GetXid();
1645
1646 ses = cifs_find_smb_ses(server, volume_info->username);
1647 if (ses) {
1648 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1649
1650 /* existing SMB ses has a server reference already */
1651 cifs_put_tcp_session(server);
1652
1653 mutex_lock(&ses->session_mutex);
1654 rc = cifs_negotiate_protocol(xid, ses);
1655 if (rc) {
1656 mutex_unlock(&ses->session_mutex);
1657 /* problem -- put our ses reference */
1658 cifs_put_smb_ses(ses);
1659 FreeXid(xid);
1660 return ERR_PTR(rc);
1661 }
1662 if (ses->need_reconnect) {
1663 cFYI(1, "Session needs reconnect");
1664 rc = cifs_setup_session(xid, ses,
1665 volume_info->local_nls);
1666 if (rc) {
1667 mutex_unlock(&ses->session_mutex);
1668 /* problem -- put our reference */
1669 cifs_put_smb_ses(ses);
1670 FreeXid(xid);
1671 return ERR_PTR(rc);
1672 }
1673 }
1674 mutex_unlock(&ses->session_mutex);
1675 FreeXid(xid);
1676 return ses;
1677 }
1678
1679 cFYI(1, "Existing smb sess not found");
1680 ses = sesInfoAlloc();
1681 if (ses == NULL)
1682 goto get_ses_fail;
1683
1684 /* new SMB session uses our server ref */
1685 ses->server = server;
1686 if (server->addr.sockAddr6.sin6_family == AF_INET6)
1687 sprintf(ses->serverName, "%pI6",
1688 &server->addr.sockAddr6.sin6_addr);
1689 else
1690 sprintf(ses->serverName, "%pI4",
1691 &server->addr.sockAddr.sin_addr.s_addr);
1692
1693 if (volume_info->username)
1694 strncpy(ses->userName, volume_info->username,
1695 MAX_USERNAME_SIZE);
1696
1697 /* volume_info->password freed at unmount */
1698 if (volume_info->password) {
1699 ses->password = kstrdup(volume_info->password, GFP_KERNEL);
1700 if (!ses->password)
1701 goto get_ses_fail;
1702 }
1703 if (volume_info->domainname) {
1704 int len = strlen(volume_info->domainname);
1705 ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1706 if (ses->domainName)
1707 strcpy(ses->domainName, volume_info->domainname);
1708 }
1709 ses->linux_uid = volume_info->linux_uid;
1710 ses->overrideSecFlg = volume_info->secFlg;
1711
1712 mutex_lock(&ses->session_mutex);
1713 rc = cifs_negotiate_protocol(xid, ses);
1714 if (!rc)
1715 rc = cifs_setup_session(xid, ses, volume_info->local_nls);
1716 mutex_unlock(&ses->session_mutex);
1717 if (rc)
1718 goto get_ses_fail;
1719
1720 /* success, put it on the list */
1721 write_lock(&cifs_tcp_ses_lock);
1722 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1723 write_unlock(&cifs_tcp_ses_lock);
1724
1725 FreeXid(xid);
1726 return ses;
1727
1728get_ses_fail:
1729 sesInfoFree(ses);
1730 FreeXid(xid);
1731 return ERR_PTR(rc);
1732}
1733
1636static struct cifsTconInfo * 1734static struct cifsTconInfo *
1637cifs_find_tcon(struct cifsSesInfo *ses, const char *unc) 1735cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1638{ 1736{
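
cifs_get_smb_ses() is the first of two new refcounted lookup-or-create helpers. The ownership rule to note: the caller hands in a referenced TCP_Server_Info, and on success the returned session owns a server reference, either the one an existing session already held (the extra caller reference is dropped via cifs_put_tcp_session() above) or the one just passed in. A hypothetical pairing sketch:

struct cifsSesInfo *ses;

ses = cifs_get_smb_ses(srvTcp, volume_info);
if (IS_ERR(ses))
	return PTR_ERR(ses);

/* use the session; on the final put, cifs_put_smb_ses() tears the
 * session down and drops its server reference as shown above */
cifs_put_smb_ses(ses);
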
@@ -1661,6 +1759,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1661 int xid; 1759 int xid;
1662 struct cifsSesInfo *ses = tcon->ses; 1760 struct cifsSesInfo *ses = tcon->ses;
1663 1761
1762 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1664 write_lock(&cifs_tcp_ses_lock); 1763 write_lock(&cifs_tcp_ses_lock);
1665 if (--tcon->tc_count > 0) { 1764 if (--tcon->tc_count > 0) {
1666 write_unlock(&cifs_tcp_ses_lock); 1765 write_unlock(&cifs_tcp_ses_lock);
@@ -1678,6 +1777,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1678 cifs_put_smb_ses(ses); 1777 cifs_put_smb_ses(ses);
1679} 1778}
1680 1779
1780static struct cifsTconInfo *
1781cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1782{
1783 int rc, xid;
1784 struct cifsTconInfo *tcon;
1785
1786 tcon = cifs_find_tcon(ses, volume_info->UNC);
1787 if (tcon) {
1788 cFYI(1, "Found match on UNC path");
1789 /* existing tcon already has a reference */
1790 cifs_put_smb_ses(ses);
1791 if (tcon->seal != volume_info->seal)
1792 cERROR(1, "transport encryption setting "
1793 "conflicts with existing tid");
1794 return tcon;
1795 }
1796
1797 tcon = tconInfoAlloc();
1798 if (tcon == NULL) {
1799 rc = -ENOMEM;
1800 goto out_fail;
1801 }
1802
1803 tcon->ses = ses;
1804 if (volume_info->password) {
1805 tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
1806 if (!tcon->password) {
1807 rc = -ENOMEM;
1808 goto out_fail;
1809 }
1810 }
1811
1812 if (strchr(volume_info->UNC + 3, '\\') == NULL
1813 && strchr(volume_info->UNC + 3, '/') == NULL) {
1814 cERROR(1, "Missing share name");
1815 rc = -ENODEV;
1816 goto out_fail;
1817 }
1818
1819 /* BB Do we need to wrap session_mutex around
1820 * this TCon call and Unix SetFS as
1821 * we do on SessSetup and reconnect? */
1822 xid = GetXid();
1823 rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
1824 FreeXid(xid);
1825 cFYI(1, "CIFS Tcon rc = %d", rc);
1826 if (rc)
1827 goto out_fail;
1828
1829 if (volume_info->nodfs) {
1830 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
1831 cFYI(1, "DFS disabled (%d)", tcon->Flags);
1832 }
1833 tcon->seal = volume_info->seal;
1834 /* we can have only one retry value for a connection
1835 to a share so for resources mounted more than once
1836 to the same server share the last value passed in
1837 for the retry flag is used */
1838 tcon->retry = volume_info->retry;
1839 tcon->nocase = volume_info->nocase;
1840 tcon->local_lease = volume_info->local_lease;
1841
1842 write_lock(&cifs_tcp_ses_lock);
1843 list_add(&tcon->tcon_list, &ses->tcon_list);
1844 write_unlock(&cifs_tcp_ses_lock);
1845
1846 return tcon;
1847
1848out_fail:
1849 tconInfoFree(tcon);
1850 return ERR_PTR(rc);
1851}
1852
1853
1681int 1854int
1682get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1855get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1683 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 1856 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
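
cifs_get_tcon() applies the same pattern one level up: it either finds a matching tree connection on the session's list (dropping the caller's extra session reference, since the existing tcon already holds one) or allocates a fresh tcon, performs CIFSTCon(), and links it in under cifs_tcp_ses_lock. A hypothetical pairing sketch showing how the references cascade on release:

struct cifsTconInfo *tcon;

tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon))
	return PTR_ERR(tcon);

/* the final put walks the chain: cifs_put_tcon() ->
 * cifs_put_smb_ses() -> cifs_put_tcp_session() */
cifs_put_tcon(tcon);
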
@@ -1702,8 +1875,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1702 strcpy(temp_unc + 2, pSesInfo->serverName); 1875 strcpy(temp_unc + 2, pSesInfo->serverName);
1703 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); 1876 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
1704 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); 1877 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
1705 cFYI(1, 1878 cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
1706 ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
1707 kfree(temp_unc); 1879 kfree(temp_unc);
1708 } 1880 }
1709 if (rc == 0) 1881 if (rc == 0)
@@ -1776,12 +1948,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1776 rc = sock_create_kern(PF_INET, SOCK_STREAM, 1948 rc = sock_create_kern(PF_INET, SOCK_STREAM,
1777 IPPROTO_TCP, &socket); 1949 IPPROTO_TCP, &socket);
1778 if (rc < 0) { 1950 if (rc < 0) {
1779 cERROR(1, ("Error %d creating socket", rc)); 1951 cERROR(1, "Error %d creating socket", rc);
1780 return rc; 1952 return rc;
1781 } 1953 }
1782 1954
1783 /* BB other socket options to set KEEPALIVE, NODELAY? */ 1955 /* BB other socket options to set KEEPALIVE, NODELAY? */
1784 cFYI(1, ("Socket created")); 1956 cFYI(1, "Socket created");
1785 server->ssocket = socket; 1957 server->ssocket = socket;
1786 socket->sk->sk_allocation = GFP_NOFS; 1958 socket->sk->sk_allocation = GFP_NOFS;
1787 cifs_reclassify_socket4(socket); 1959 cifs_reclassify_socket4(socket);
@@ -1826,7 +1998,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1826 if (!connected) { 1998 if (!connected) {
1827 if (orig_port) 1999 if (orig_port)
1828 server->addr.sockAddr.sin_port = orig_port; 2000 server->addr.sockAddr.sin_port = orig_port;
1829 cFYI(1, ("Error %d connecting to server via ipv4", rc)); 2001 cFYI(1, "Error %d connecting to server via ipv4", rc);
1830 sock_release(socket); 2002 sock_release(socket);
1831 server->ssocket = NULL; 2003 server->ssocket = NULL;
1832 return rc; 2004 return rc;
@@ -1854,12 +2026,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1854 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2026 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1855 (char *)&val, sizeof(val)); 2027 (char *)&val, sizeof(val));
1856 if (rc) 2028 if (rc)
1857 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2029 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
1858 } 2030 }
1859 2031
1860 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 2032 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1861 socket->sk->sk_sndbuf, 2033 socket->sk->sk_sndbuf,
1862 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 2034 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
1863 2035
1864 /* send RFC1001 sessinit */ 2036 /* send RFC1001 sessinit */
1865 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2037 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1937,13 +2109,13 @@ ipv6_connect(struct TCP_Server_Info *server)
1937 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2109 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
1938 IPPROTO_TCP, &socket); 2110 IPPROTO_TCP, &socket);
1939 if (rc < 0) { 2111 if (rc < 0) {
1940 cERROR(1, ("Error %d creating ipv6 socket", rc)); 2112 cERROR(1, "Error %d creating ipv6 socket", rc);
1941 socket = NULL; 2113 socket = NULL;
1942 return rc; 2114 return rc;
1943 } 2115 }
1944 2116
1945 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2117 /* BB other socket options to set KEEPALIVE, NODELAY? */
1946 cFYI(1, ("ipv6 Socket created")); 2118 cFYI(1, "ipv6 Socket created");
1947 server->ssocket = socket; 2119 server->ssocket = socket;
1948 socket->sk->sk_allocation = GFP_NOFS; 2120 socket->sk->sk_allocation = GFP_NOFS;
1949 cifs_reclassify_socket6(socket); 2121 cifs_reclassify_socket6(socket);
@@ -1987,7 +2159,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 if (!connected) { 2159 if (!connected) {
1988 if (orig_port) 2160 if (orig_port)
1989 server->addr.sockAddr6.sin6_port = orig_port; 2161 server->addr.sockAddr6.sin6_port = orig_port;
1990 cFYI(1, ("Error %d connecting to server via ipv6", rc)); 2162 cFYI(1, "Error %d connecting to server via ipv6", rc);
1991 sock_release(socket); 2163 sock_release(socket);
1992 server->ssocket = NULL; 2164 server->ssocket = NULL;
1993 return rc; 2165 return rc;
@@ -2006,7 +2178,7 @@ ipv6_connect(struct TCP_Server_Info *server)
2006 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2178 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2007 (char *)&val, sizeof(val)); 2179 (char *)&val, sizeof(val));
2008 if (rc) 2180 if (rc)
2009 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2181 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2010 } 2182 }
2011 2183
2012 server->ssocket = socket; 2184 server->ssocket = socket;
@@ -2031,13 +2203,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2031 if (vol_info && vol_info->no_linux_ext) { 2203 if (vol_info && vol_info->no_linux_ext) {
2032 tcon->fsUnixInfo.Capability = 0; 2204 tcon->fsUnixInfo.Capability = 0;
2033 tcon->unix_ext = 0; /* Unix Extensions disabled */ 2205 tcon->unix_ext = 0; /* Unix Extensions disabled */
2034 cFYI(1, ("Linux protocol extensions disabled")); 2206 cFYI(1, "Linux protocol extensions disabled");
2035 return; 2207 return;
2036 } else if (vol_info) 2208 } else if (vol_info)
2037 tcon->unix_ext = 1; /* Unix Extensions supported */ 2209 tcon->unix_ext = 1; /* Unix Extensions supported */
2038 2210
2039 if (tcon->unix_ext == 0) { 2211 if (tcon->unix_ext == 0) {
2040 cFYI(1, ("Unix extensions disabled so not set on reconnect")); 2212 cFYI(1, "Unix extensions disabled so not set on reconnect");
2041 return; 2213 return;
2042 } 2214 }
2043 2215
@@ -2053,12 +2225,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2053 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2225 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2054 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2226 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2055 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2227 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2056 cERROR(1, ("POSIXPATH support change")); 2228 cERROR(1, "POSIXPATH support change");
2057 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2229 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2058 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2230 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2059 cERROR(1, ("possible reconnect error")); 2231 cERROR(1, "possible reconnect error");
2060 cERROR(1, 2232 cERROR(1, "server disabled POSIX path support");
2061 ("server disabled POSIX path support"));
2062 } 2233 }
2063 } 2234 }
2064 2235
@@ -2066,7 +2237,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2066 if (vol_info && vol_info->no_psx_acl) 2237 if (vol_info && vol_info->no_psx_acl)
2067 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2238 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2068 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { 2239 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
2069 cFYI(1, ("negotiated posix acl support")); 2240 cFYI(1, "negotiated posix acl support");
2070 if (sb) 2241 if (sb)
2071 sb->s_flags |= MS_POSIXACL; 2242 sb->s_flags |= MS_POSIXACL;
2072 } 2243 }
@@ -2074,7 +2245,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2074 if (vol_info && vol_info->posix_paths == 0) 2245 if (vol_info && vol_info->posix_paths == 0)
2075 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2246 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2076 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { 2247 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
2077 cFYI(1, ("negotiate posix pathnames")); 2248 cFYI(1, "negotiate posix pathnames");
2078 if (sb) 2249 if (sb)
2079 CIFS_SB(sb)->mnt_cifs_flags |= 2250 CIFS_SB(sb)->mnt_cifs_flags |=
2080 CIFS_MOUNT_POSIX_PATHS; 2251 CIFS_MOUNT_POSIX_PATHS;
@@ -2089,39 +2260,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2089 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { 2260 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
2090 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { 2261 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2091 CIFS_SB(sb)->rsize = 127 * 1024; 2262 CIFS_SB(sb)->rsize = 127 * 1024;
2092 cFYI(DBG2, 2263 cFYI(DBG2, "larger reads not supported by srv");
2093 ("larger reads not supported by srv"));
2094 } 2264 }
2095 } 2265 }
2096 2266
2097 2267
2098 cFYI(1, ("Negotiate caps 0x%x", (int)cap)); 2268 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2099#ifdef CONFIG_CIFS_DEBUG2 2269#ifdef CONFIG_CIFS_DEBUG2
2100 if (cap & CIFS_UNIX_FCNTL_CAP) 2270 if (cap & CIFS_UNIX_FCNTL_CAP)
2101 cFYI(1, ("FCNTL cap")); 2271 cFYI(1, "FCNTL cap");
2102 if (cap & CIFS_UNIX_EXTATTR_CAP) 2272 if (cap & CIFS_UNIX_EXTATTR_CAP)
2103 cFYI(1, ("EXTATTR cap")); 2273 cFYI(1, "EXTATTR cap");
2104 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2274 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2105 cFYI(1, ("POSIX path cap")); 2275 cFYI(1, "POSIX path cap");
2106 if (cap & CIFS_UNIX_XATTR_CAP) 2276 if (cap & CIFS_UNIX_XATTR_CAP)
2107 cFYI(1, ("XATTR cap")); 2277 cFYI(1, "XATTR cap");
2108 if (cap & CIFS_UNIX_POSIX_ACL_CAP) 2278 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
2109 cFYI(1, ("POSIX ACL cap")); 2279 cFYI(1, "POSIX ACL cap");
2110 if (cap & CIFS_UNIX_LARGE_READ_CAP) 2280 if (cap & CIFS_UNIX_LARGE_READ_CAP)
2111 cFYI(1, ("very large read cap")); 2281 cFYI(1, "very large read cap");
2112 if (cap & CIFS_UNIX_LARGE_WRITE_CAP) 2282 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
2113 cFYI(1, ("very large write cap")); 2283 cFYI(1, "very large write cap");
2114#endif /* CIFS_DEBUG2 */ 2284#endif /* CIFS_DEBUG2 */
2115 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 2285 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
2116 if (vol_info == NULL) { 2286 if (vol_info == NULL) {
2117 cFYI(1, ("resetting capabilities failed")); 2287 cFYI(1, "resetting capabilities failed");
2118 } else 2288 } else
2119 cERROR(1, ("Negotiating Unix capabilities " 2289 cERROR(1, "Negotiating Unix capabilities "
2120 "with the server failed. Consider " 2290 "with the server failed. Consider "
2121 "mounting with the Unix Extensions\n" 2291 "mounting with the Unix Extensions\n"
2122 "disabled, if problems are found, " 2292 "disabled, if problems are found, "
2123 "by specifying the nounix mount " 2293 "by specifying the nounix mount "
2124 "option.")); 2294 "option.");
2125 2295
2126 } 2296 }
2127 } 2297 }
@@ -2151,8 +2321,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2151 struct cifs_sb_info *cifs_sb) 2321 struct cifs_sb_info *cifs_sb)
2152{ 2322{
2153 if (pvolume_info->rsize > CIFSMaxBufSize) { 2323 if (pvolume_info->rsize > CIFSMaxBufSize) {
2154 cERROR(1, ("rsize %d too large, using MaxBufSize", 2324 cERROR(1, "rsize %d too large, using MaxBufSize",
2155 pvolume_info->rsize)); 2325 pvolume_info->rsize);
2156 cifs_sb->rsize = CIFSMaxBufSize; 2326 cifs_sb->rsize = CIFSMaxBufSize;
2157 } else if ((pvolume_info->rsize) && 2327 } else if ((pvolume_info->rsize) &&
2158 (pvolume_info->rsize <= CIFSMaxBufSize)) 2328 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2161,8 +2331,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2161 cifs_sb->rsize = CIFSMaxBufSize; 2331 cifs_sb->rsize = CIFSMaxBufSize;
2162 2332
2163 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 2333 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
2164 cERROR(1, ("wsize %d too large, using 4096 instead", 2334 cERROR(1, "wsize %d too large, using 4096 instead",
2165 pvolume_info->wsize)); 2335 pvolume_info->wsize);
2166 cifs_sb->wsize = 4096; 2336 cifs_sb->wsize = 4096;
2167 } else if (pvolume_info->wsize) 2337 } else if (pvolume_info->wsize)
2168 cifs_sb->wsize = pvolume_info->wsize; 2338 cifs_sb->wsize = pvolume_info->wsize;
@@ -2180,7 +2350,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2180 if (cifs_sb->rsize < 2048) { 2350 if (cifs_sb->rsize < 2048) {
2181 cifs_sb->rsize = 2048; 2351 cifs_sb->rsize = 2048;
2182 /* Windows ME may prefer this */ 2352 /* Windows ME may prefer this */
2183 cFYI(1, ("readsize set to minimum: 2048")); 2353 cFYI(1, "readsize set to minimum: 2048");
2184 } 2354 }
2185 /* calculate prepath */ 2355 /* calculate prepath */
2186 cifs_sb->prepath = pvolume_info->prepath; 2356 cifs_sb->prepath = pvolume_info->prepath;
@@ -2198,8 +2368,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2198 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2368 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2199 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2369 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2200 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2370 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2201 cFYI(1, ("file mode: 0x%x dir mode: 0x%x", 2371 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2202 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode)); 2372 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2203 2373
2204 if (pvolume_info->noperm) 2374 if (pvolume_info->noperm)
2205 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2375 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,13 +2398,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2228 if (pvolume_info->dynperm) 2398 if (pvolume_info->dynperm)
2229 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2399 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2230 if (pvolume_info->direct_io) { 2400 if (pvolume_info->direct_io) {
2231 cFYI(1, ("mounting share using direct i/o")); 2401 cFYI(1, "mounting share using direct i/o");
2232 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2402 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2233 } 2403 }
2234 2404
2235 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2405 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2236 cERROR(1, ("mount option dynperm ignored if cifsacl " 2406 cERROR(1, "mount option dynperm ignored if cifsacl "
2237 "mount option supported")); 2407 "mount option supported");
2238} 2408}
2239 2409
2240static int 2410static int
@@ -2261,7 +2431,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
2261{ 2431{
2262 struct smb_vol *volume_info; 2432 struct smb_vol *volume_info;
2263 2433
2264 if (!pvolume_info && !*pvolume_info) 2434 if (!pvolume_info || !*pvolume_info)
2265 return; 2435 return;
2266 2436
2267 volume_info = *pvolume_info; 2437 volume_info = *pvolume_info;
@@ -2343,11 +2513,11 @@ try_mount_again:
2343 } 2513 }
2344 2514
2345 if (volume_info->nullauth) { 2515 if (volume_info->nullauth) {
2346 cFYI(1, ("null user")); 2516 cFYI(1, "null user");
2347 volume_info->username = ""; 2517 volume_info->username = "";
2348 } else if (volume_info->username) { 2518 } else if (volume_info->username) {
2349 /* BB fixme parse for domain name here */ 2519 /* BB fixme parse for domain name here */
2350 cFYI(1, ("Username: %s", volume_info->username)); 2520 cFYI(1, "Username: %s", volume_info->username);
2351 } else { 2521 } else {
2352 cifserror("No username specified"); 2522 cifserror("No username specified");
2353 /* In userspace mount helper we can get user name from alternate 2523 /* In userspace mount helper we can get user name from alternate
@@ -2356,20 +2526,20 @@ try_mount_again:
2356 goto out; 2526 goto out;
2357 } 2527 }
2358 2528
2359
2360 /* this is needed for ASCII cp to Unicode converts */ 2529 /* this is needed for ASCII cp to Unicode converts */
2361 if (volume_info->iocharset == NULL) { 2530 if (volume_info->iocharset == NULL) {
2362 cifs_sb->local_nls = load_nls_default(); 2531 /* load_nls_default cannot return null */
2363 /* load_nls_default can not return null */ 2532 volume_info->local_nls = load_nls_default();
2364 } else { 2533 } else {
2365 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2534 volume_info->local_nls = load_nls(volume_info->iocharset);
2366 if (cifs_sb->local_nls == NULL) { 2535 if (volume_info->local_nls == NULL) {
2367 cERROR(1, ("CIFS mount error: iocharset %s not found", 2536 cERROR(1, "CIFS mount error: iocharset %s not found",
2368 volume_info->iocharset)); 2537 volume_info->iocharset);
2369 rc = -ELIBACC; 2538 rc = -ELIBACC;
2370 goto out; 2539 goto out;
2371 } 2540 }
2372 } 2541 }
2542 cifs_sb->local_nls = volume_info->local_nls;
2373 2543
2374 /* get a reference to a tcp session */ 2544 /* get a reference to a tcp session */
2375 srvTcp = cifs_get_tcp_session(volume_info); 2545 srvTcp = cifs_get_tcp_session(volume_info);
@@ -2378,148 +2548,30 @@ try_mount_again:
2378 goto out; 2548 goto out;
2379 } 2549 }
2380 2550
2381 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2551 /* get a reference to a SMB session */
2382 if (pSesInfo) { 2552 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2383 cFYI(1, ("Existing smb sess found (status=%d)", 2553 if (IS_ERR(pSesInfo)) {
2384 pSesInfo->status)); 2554 rc = PTR_ERR(pSesInfo);
2385 /* 2555 pSesInfo = NULL;
2386 * The existing SMB session already has a reference to srvTcp, 2556 goto mount_fail_check;
2387 * so we can put back the extra one we got before
2388 */
2389 cifs_put_tcp_session(srvTcp);
2390
2391 mutex_lock(&pSesInfo->session_mutex);
2392 if (pSesInfo->need_reconnect) {
2393 cFYI(1, ("Session needs reconnect"));
2394 rc = cifs_setup_session(xid, pSesInfo,
2395 cifs_sb->local_nls);
2396 }
2397 mutex_unlock(&pSesInfo->session_mutex);
2398 } else if (!rc) {
2399 cFYI(1, ("Existing smb sess not found"));
2400 pSesInfo = sesInfoAlloc();
2401 if (pSesInfo == NULL) {
2402 rc = -ENOMEM;
2403 goto mount_fail_check;
2404 }
2405
2406 /* new SMB session uses our srvTcp ref */
2407 pSesInfo->server = srvTcp;
2408 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2409 sprintf(pSesInfo->serverName, "%pI6",
2410 &srvTcp->addr.sockAddr6.sin6_addr);
2411 else
2412 sprintf(pSesInfo->serverName, "%pI4",
2413 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2414
2415 write_lock(&cifs_tcp_ses_lock);
2416 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2417 write_unlock(&cifs_tcp_ses_lock);
2418
2419 /* volume_info->password freed at unmount */
2420 if (volume_info->password) {
2421 pSesInfo->password = kstrdup(volume_info->password,
2422 GFP_KERNEL);
2423 if (!pSesInfo->password) {
2424 rc = -ENOMEM;
2425 goto mount_fail_check;
2426 }
2427 }
2428 if (volume_info->username)
2429 strncpy(pSesInfo->userName, volume_info->username,
2430 MAX_USERNAME_SIZE);
2431 if (volume_info->domainname) {
2432 int len = strlen(volume_info->domainname);
2433 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2434 if (pSesInfo->domainName)
2435 strcpy(pSesInfo->domainName,
2436 volume_info->domainname);
2437 }
2438 pSesInfo->linux_uid = volume_info->linux_uid;
2439 pSesInfo->overrideSecFlg = volume_info->secFlg;
2440 mutex_lock(&pSesInfo->session_mutex);
2441
2442 /* BB FIXME need to pass vol->secFlgs BB */
2443 rc = cifs_setup_session(xid, pSesInfo,
2444 cifs_sb->local_nls);
2445 mutex_unlock(&pSesInfo->session_mutex);
2446 } 2557 }
2447 2558
2448 /* search for existing tcon to this server share */ 2559 setup_cifs_sb(volume_info, cifs_sb);
2449 if (!rc) { 2560 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2450 setup_cifs_sb(volume_info, cifs_sb); 2561 sb->s_maxbytes = MAX_LFS_FILESIZE;
2451 2562 else
2452 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2563 sb->s_maxbytes = MAX_NON_LFS;
2453 if (tcon) {
2454 cFYI(1, ("Found match on UNC path"));
2455 /* existing tcon already has a reference */
2456 cifs_put_smb_ses(pSesInfo);
2457 if (tcon->seal != volume_info->seal)
2458 cERROR(1, ("transport encryption setting "
2459 "conflicts with existing tid"));
2460 } else {
2461 tcon = tconInfoAlloc();
2462 if (tcon == NULL) {
2463 rc = -ENOMEM;
2464 goto mount_fail_check;
2465 }
2466
2467 tcon->ses = pSesInfo;
2468 if (volume_info->password) {
2469 tcon->password = kstrdup(volume_info->password,
2470 GFP_KERNEL);
2471 if (!tcon->password) {
2472 rc = -ENOMEM;
2473 goto mount_fail_check;
2474 }
2475 }
2476
2477 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2478 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2479 cERROR(1, ("Missing share name"));
2480 rc = -ENODEV;
2481 goto mount_fail_check;
2482 } else {
2483 /* BB Do we need to wrap sesSem around
2484 * this TCon call and Unix SetFS as
2485 * we do on SessSetup and reconnect? */
2486 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2487 tcon, cifs_sb->local_nls);
2488 cFYI(1, ("CIFS Tcon rc = %d", rc));
2489 if (volume_info->nodfs) {
2490 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2491 cFYI(1, ("DFS disabled (%d)",
2492 tcon->Flags));
2493 }
2494 }
2495 if (rc)
2496 goto remote_path_check;
2497 tcon->seal = volume_info->seal;
2498 write_lock(&cifs_tcp_ses_lock);
2499 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2500 write_unlock(&cifs_tcp_ses_lock);
2501 }
2502
2503 /* we can have only one retry value for a connection
2504 to a share so for resources mounted more than once
2505 to the same server share the last value passed in
2506 for the retry flag is used */
2507 tcon->retry = volume_info->retry;
2508 tcon->nocase = volume_info->nocase;
2509 tcon->local_lease = volume_info->local_lease;
2510 }
2511 if (pSesInfo) {
2512 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2513 sb->s_maxbytes = MAX_LFS_FILESIZE;
2514 else
2515 sb->s_maxbytes = MAX_NON_LFS;
2516 }
2517 2564
2518 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2565 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2519 sb->s_time_gran = 100; 2566 sb->s_time_gran = 100;
2520 2567
2521 if (rc) 2568 /* search for existing tcon to this server share */
2569 tcon = cifs_get_tcon(pSesInfo, volume_info);
2570 if (IS_ERR(tcon)) {
2571 rc = PTR_ERR(tcon);
2572 tcon = NULL;
2522 goto remote_path_check; 2573 goto remote_path_check;
2574 }
2523 2575
2524 cifs_sb->tcon = tcon; 2576 cifs_sb->tcon = tcon;
2525 2577
@@ -2543,7 +2595,7 @@ try_mount_again:
2543 2595
2544 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2596 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2545 cifs_sb->rsize = 1024 * 127; 2597 cifs_sb->rsize = 1024 * 127;
2546 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2598 cFYI(DBG2, "no very large read support, rsize now 127K");
2547 } 2599 }
2548 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2600 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2549 cifs_sb->wsize = min(cifs_sb->wsize, 2601 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2592,7 +2644,7 @@ remote_path_check:
2592 goto mount_fail_check; 2644 goto mount_fail_check;
2593 } 2645 }
2594 2646
2595 cFYI(1, ("Getting referral for: %s", full_path)); 2647 cFYI(1, "Getting referral for: %s", full_path);
2596 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2648 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2597 cifs_sb->local_nls, &num_referrals, &referrals, 2649 cifs_sb->local_nls, &num_referrals, &referrals,
2598 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2650 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2706,7 +2758,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2706 by Samba (not sure whether other servers allow 2758 by Samba (not sure whether other servers allow
2707 NTLMv2 password here) */ 2759 NTLMv2 password here) */
2708#ifdef CONFIG_CIFS_WEAK_PW_HASH 2760#ifdef CONFIG_CIFS_WEAK_PW_HASH
2709 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2761 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2710 (ses->server->secType == LANMAN)) 2762 (ses->server->secType == LANMAN))
2711 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2763 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2712 ses->server->secMode & 2764 ses->server->secMode &
@@ -2777,13 +2829,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2777 if (length == 3) { 2829 if (length == 3) {
2778 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2830 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2779 (bcc_ptr[2] == 'C')) { 2831 (bcc_ptr[2] == 'C')) {
2780 cFYI(1, ("IPC connection")); 2832 cFYI(1, "IPC connection");
2781 tcon->ipc = 1; 2833 tcon->ipc = 1;
2782 } 2834 }
2783 } else if (length == 2) { 2835 } else if (length == 2) {
2784 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2836 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2785 /* the most common case */ 2837 /* the most common case */
2786 cFYI(1, ("disk share connection")); 2838 cFYI(1, "disk share connection");
2787 } 2839 }
2788 } 2840 }
2789 bcc_ptr += length + 1; 2841 bcc_ptr += length + 1;
@@ -2796,7 +2848,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2796 bytes_left, is_unicode, 2848 bytes_left, is_unicode,
2797 nls_codepage); 2849 nls_codepage);
2798 2850
2799 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2851 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2800 2852
2801 if ((smb_buffer_response->WordCount == 3) || 2853 if ((smb_buffer_response->WordCount == 3) ||
2802 (smb_buffer_response->WordCount == 7)) 2854 (smb_buffer_response->WordCount == 7))
@@ -2804,7 +2856,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2804 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2856 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2805 else 2857 else
2806 tcon->Flags = 0; 2858 tcon->Flags = 0;
2807 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2859 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2808 } else if ((rc == 0) && tcon == NULL) { 2860 } else if ((rc == 0) && tcon == NULL) {
2809 /* all we need to save for IPC$ connection */ 2861 /* all we need to save for IPC$ connection */
2810 ses->ipc_tid = smb_buffer_response->Tid; 2862 ses->ipc_tid = smb_buffer_response->Tid;
@@ -2832,57 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2832 return rc; 2884 return rc;
2833} 2885}
2834 2886
2835 int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table *nls_info) 2887 int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2837{ 2888{
2838 int rc = 0; 2889 int rc = 0;
2839 int first_time = 0; 2890 struct TCP_Server_Info *server = ses->server;
2840 struct TCP_Server_Info *server = pSesInfo->server; 2891
2841 2892 /* only send once per connect */
2842 /* what if server changes its buffer size after dropping the session? */ 2893 if (server->maxBuf != 0)
2843 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2894 return 0;
2844 rc = CIFSSMBNegotiate(xid, pSesInfo); 2895
2845 if (rc == -EAGAIN) { 2896 rc = CIFSSMBNegotiate(xid, ses);
2846 /* retry only once on 1st time connection */ 2897 if (rc == -EAGAIN) {
2847 rc = CIFSSMBNegotiate(xid, pSesInfo); 2898 /* retry only once on 1st time connection */
2848 if (rc == -EAGAIN) 2899 rc = CIFSSMBNegotiate(xid, ses);
2849 rc = -EHOSTDOWN; 2900 if (rc == -EAGAIN)
2850 } 2901 rc = -EHOSTDOWN;
2851 if (rc == 0) { 2902 }
2852 spin_lock(&GlobalMid_Lock); 2903 if (rc == 0) {
2853 if (server->tcpStatus != CifsExiting) 2904 spin_lock(&GlobalMid_Lock);
2854 server->tcpStatus = CifsGood; 2905 if (server->tcpStatus != CifsExiting)
2855 else 2906 server->tcpStatus = CifsGood;
2856 rc = -EHOSTDOWN; 2907 else
2857 spin_unlock(&GlobalMid_Lock); 2908 rc = -EHOSTDOWN;
2909 spin_unlock(&GlobalMid_Lock);
2858 2910
2859 }
2860 first_time = 1;
2861 } 2911 }
2862 2912
2863 if (rc) 2913 return rc;
2864 goto ss_err_exit; 2914}
2915
2916
2917int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2918 struct nls_table *nls_info)
2919{
2920 int rc = 0;
2921 struct TCP_Server_Info *server = ses->server;
2865 2922
2866 pSesInfo->flags = 0; 2923 ses->flags = 0;
2867 pSesInfo->capabilities = server->capabilities; 2924 ses->capabilities = server->capabilities;
2868 if (linuxExtEnabled == 0) 2925 if (linuxExtEnabled == 0)
2869 pSesInfo->capabilities &= (~CAP_UNIX); 2926 ses->capabilities &= (~CAP_UNIX);
2870 2927
2871 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2928 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2872 server->secMode, server->capabilities, server->timeAdj)); 2929 server->secMode, server->capabilities, server->timeAdj);
2873 2930
2874 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2931 rc = CIFS_SessSetup(xid, ses, nls_info);
2875 if (rc) { 2932 if (rc) {
2876 cERROR(1, ("Send error in SessSetup = %d", rc)); 2933 cERROR(1, "Send error in SessSetup = %d", rc);
2877 } else { 2934 } else {
2878 cFYI(1, ("CIFS Session Established successfully")); 2935 cFYI(1, "CIFS Session Established successfully");
2879 spin_lock(&GlobalMid_Lock); 2936 spin_lock(&GlobalMid_Lock);
2880 pSesInfo->status = CifsGood; 2937 ses->status = CifsGood;
2881 pSesInfo->need_reconnect = false; 2938 ses->need_reconnect = false;
2882 spin_unlock(&GlobalMid_Lock); 2939 spin_unlock(&GlobalMid_Lock);
2883 } 2940 }
2884 2941
2885ss_err_exit:
2886 return rc; 2942 return rc;
2887} 2943}
2888 2944
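Most of the connect.c churn above is mechanical fallout from a single change: it assumes the cFYI()/cERROR() debug macros were converted to printf-style variadic macros, which lets every call site drop the extra parentheses that used to wrap the format string and its arguments. A minimal sketch of that macro shape — the real definitions live in fs/cifs/cifs_debug.h and may differ in detail:

	/* sketch only -- assumes the debug macros became variadic */
	#define cFYI(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
	} while (0)

	/* old call style:  cFYI(1, ("rc = %d", rc));   extra parens   */
	/* new call style:  cFYI(1, "rc = %d", rc);     plain arg list */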
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e9f7ecc2714b..391816b461ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -73,7 +73,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 73 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 74 temp = temp->d_parent;
75 if (temp == NULL) { 75 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 76 cERROR(1, "corrupt dentry");
77 return NULL; 77 return NULL;
78 } 78 }
79 } 79 }
@@ -90,19 +90,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 90 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 91 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 92 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 93 cFYI(0, "name: %s", full_path + namelen);
94 } 94 }
95 temp = temp->d_parent; 95 temp = temp->d_parent;
96 if (temp == NULL) { 96 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 97 cERROR(1, "corrupt dentry");
98 kfree(full_path); 98 kfree(full_path);
99 return NULL; 99 return NULL;
100 } 100 }
101 } 101 }
102 if (namelen != pplen + dfsplen) { 102 if (namelen != pplen + dfsplen) {
103 cERROR(1, ("did not end path lookup where expected namelen is %d", namelen)); 103 cERROR(1, "did not end path lookup where expected namelen is %d", namelen);
106 /* presumably this is only possible if racing with a rename 105 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 106 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 107 above us to prevent this, but retrying should be harmless) */
@@ -130,6 +129,12 @@ cifs_bp_rename_retry:
130 return full_path; 129 return full_path;
131} 130}
132 131
132/*
133 * When called with struct file pointer set to NULL, there is no way we could
134 * update file->private_data, but getting it stuck on openFileList provides a
135 * way to access it from cifs_fill_filedata and thereby set file->private_data
136 * from cifs_open.
137 */
133struct cifsFileInfo * 138struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, 139cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags) 140 struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -173,7 +178,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 178 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 179 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 180 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 181 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 182 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 183 pCifsInode->clientCanCacheRead = true;
179 } 184 }
@@ -183,16 +188,17 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
183} 188}
184 189
185int cifs_posix_open(char *full_path, struct inode **pinode, 190int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 191 struct vfsmount *mnt, struct super_block *sb,
187 __u32 *poplock, __u16 *pnetfid, int xid) 192 int mode, int oflags,
193 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 194{
189 int rc; 195 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 196 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 197 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 198 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 199 struct cifs_fattr fattr;
194 200
195 cFYI(1, ("posix open %s", full_path)); 201 cFYI(1, "posix open %s", full_path);
196 202
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 203 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 204 if (presp_data == NULL)
@@ -242,7 +248,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 248
243 /* get new inode and set it up */ 249 /* get new inode and set it up */
244 if (*pinode == NULL) { 250 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 251 cifs_fill_uniqueid(sb, &fattr);
252 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 253 if (!*pinode) {
247 rc = -ENOMEM; 254 rc = -ENOMEM;
248 goto posix_open_ret; 255 goto posix_open_ret;
@@ -251,7 +258,18 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 258 cifs_fattr_to_inode(*pinode, &fattr);
252 } 259 }
253 260
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags); 261 /*
262 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
263 * file->private_data.
264 */
265 if (mnt) {
266 struct cifsFileInfo *pfile_info;
267
268 pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
269 oflags);
270 if (pfile_info == NULL)
271 rc = -ENOMEM;
272 }
255 273
256posix_open_ret: 274posix_open_ret:
257 kfree(presp_data); 275 kfree(presp_data);
@@ -315,13 +333,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 333 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 334 oflags = nd->intent.open.flags;
317 else 335 else
318 oflags = FMODE_READ; 336 oflags = FMODE_READ | SMB_O_CREAT;
319 337
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 338 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 339 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 340 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 341 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 342 nd ? nd->path.mnt : NULL,
343 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 344 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 345 supported, despite what server claimed in capability
327 negotation. EREMOTE indicates DFS junction, which is not 346 negotation. EREMOTE indicates DFS junction, which is not
@@ -358,7 +377,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 377 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 378 disposition = FILE_OPEN_IF;
360 else 379 else
361 cFYI(1, ("Create flag not set in create function")); 380 cFYI(1, "Create flag not set in create function");
362 } 381 }
363 382
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 383 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +413,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 413 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 414 }
396 if (rc) { 415 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 416 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 417 goto cifs_create_out;
399 } 418 }
400 419
@@ -457,15 +476,22 @@ cifs_create_set_dentry:
457 if (rc == 0) 476 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 477 setup_cifs_dentry(tcon, direntry, newinode);
459 else 478 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 479 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
461 480
462 /* nfsd case - nfs srv does not set nd */ 481 /* nfsd case - nfs srv does not set nd */
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 482 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
464 /* mknod case - do not leave file open */ 483 /* mknod case - do not leave file open */
465 CIFSSMBClose(xid, tcon, fileHandle); 484 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) { 485 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL, 486 struct cifsFileInfo *pfile_info;
468 nd->path.mnt, oflags); 487 /*
488 * cifs_fill_filedata() takes care of setting cifsFileInfo
489 * pointer to file->private_data.
490 */
491 pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
492 nd->path.mnt, oflags);
493 if (pfile_info == NULL)
494 rc = -ENOMEM;
469 } 495 }
470cifs_create_out: 496cifs_create_out:
471 kfree(buf); 497 kfree(buf);
@@ -531,7 +557,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
531 u16 fileHandle; 557 u16 fileHandle;
532 FILE_ALL_INFO *buf; 558 FILE_ALL_INFO *buf;
533 559
534 cFYI(1, ("sfu compat create special file")); 560 cFYI(1, "sfu compat create special file");
535 561
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 562 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
537 if (buf == NULL) { 563 if (buf == NULL) {
@@ -616,8 +642,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
616 642
617 xid = GetXid(); 643 xid = GetXid();
618 644
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 645 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 646 parent_dir_inode, direntry->d_name.name, direntry);
621 647
622 /* check whether path exists */ 648 /* check whether path exists */
623 649
@@ -632,7 +658,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 658 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 659 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 660 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 661 cFYI(1, "Invalid file name");
636 FreeXid(xid); 662 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 663 return ERR_PTR(-EINVAL);
638 } 664 }
@@ -657,11 +683,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 683 }
658 684
659 if (direntry->d_inode != NULL) { 685 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 686 cFYI(1, "non-NULL inode in lookup");
661 } else { 687 } else {
662 cFYI(1, ("NULL inode in lookup")); 688 cFYI(1, "NULL inode in lookup");
663 } 689 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 690 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 691
666 /* Posix open is only called (at lookup time) for file create now. 692 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 693 * For opens (rather than creates), because we do not know if it
@@ -678,6 +704,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 704 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 705 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 706 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
707 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 708 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 709 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 710 &fileHandle, xid);
@@ -723,7 +750,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 750 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 751 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 752 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 753 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 754 /* We special case check for Access Denied - since that
728 is a common return code */ 755 is a common return code */
729 } 756 }
@@ -742,8 +769,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
742 if (cifs_revalidate_dentry(direntry)) 769 if (cifs_revalidate_dentry(direntry))
743 return 0; 770 return 0;
744 } else { 771 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 772 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 773 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 774 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 775 !lookupCacheEnabled) {
749 d_drop(direntry); 776 d_drop(direntry);
@@ -758,7 +785,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 785{
759 int rc = 0; 786 int rc = 0;
760 787
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 788 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 789
763 return rc; 790 return rc;
764} */ 791} */
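The dir.c hunks above thread a struct super_block through cifs_posix_open() instead of deriving it from the vfsmount, so callers that have no nameidata (the nfsd export path) can pass a NULL mnt and still reach the cifs_sb. A simplified sketch of the resulting call pattern, condensed from the hunks above:

	/* sketch: sb passed explicitly; mnt may be NULL on the nfsd path */
	rc = cifs_posix_open(full_path, &newinode,
			     nd ? nd->path.mnt : NULL,
			     inode->i_sb, mode, oflags,
			     &oplock, &fileHandle, xid);

When no mount point is available, no cifsFileInfo is attached for the caller, which is why the create paths above now also treat a NULL return from cifs_new_fileinfo() as -ENOMEM.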
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..4db2c5e7283f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
@@ -105,14 +106,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
105 /* search for server name delimiter */ 106 /* search for server name delimiter */
106 len = strlen(unc); 107 len = strlen(unc);
107 if (len < 3) { 108 if (len < 3) {
108 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 109 cFYI(1, "%s: unc is too short: %s", __func__, unc);
109 return -EINVAL; 110 return -EINVAL;
110 } 111 }
111 len -= 2; 112 len -= 2;
112 name = memchr(unc+2, '\\', len); 113 name = memchr(unc+2, '\\', len);
113 if (!name) { 114 if (!name) {
114 cFYI(1, ("%s: probably server name is whole unc: %s", 115 cFYI(1, "%s: probably server name is whole unc: %s",
115 __func__, unc)); 116 __func__, unc);
116 } else { 117 } else {
117 len = (name - unc) - 2/* leading // */; 118 len = (name - unc) - 2/* leading // */;
118 } 119 }
@@ -126,8 +127,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
126 name[len] = 0; 127 name[len] = 0;
127 128
128 if (is_ip(name)) { 129 if (is_ip(name)) {
129 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 130 cFYI(1, "%s: it is IP, skipping dns upcall: %s",
130 __func__, name)); 131 __func__, name);
131 data = name; 132 data = name;
132 goto skip_upcall; 133 goto skip_upcall;
133 } 134 }
@@ -137,7 +138,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
137 len = rkey->type_data.x[0]; 138 len = rkey->type_data.x[0];
138 data = rkey->payload.data; 139 data = rkey->payload.data;
139 } else { 140 } else {
140 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 141 cERROR(1, "%s: unable to resolve: %s", __func__, name);
141 goto out; 142 goto out;
142 } 143 }
143 144
@@ -147,10 +148,10 @@ skip_upcall:
147 if (*ip_addr) { 148 if (*ip_addr) {
148 memcpy(*ip_addr, data, len + 1); 149 memcpy(*ip_addr, data, len + 1);
149 if (!IS_ERR(rkey)) 150 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 151 cFYI(1, "%s: resolved: %s to %s", __func__,
151 name, 152 name,
152 *ip_addr 153 *ip_addr
153 )); 154 );
154 rc = 0; 155 rc = 0;
155 } else { 156 } else {
156 rc = -ENOMEM; 157 rc = -ENOMEM;
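Beyond the message cleanup, the only functional change to dns_resolve.c is the new #include <linux/slab.h>. This follows the kernel-wide header split of that period, after which kmalloc()/kfree() users could no longer count on slab.h arriving indirectly through other headers. A hedged sketch of the resulting idiom (the helper is hypothetical, for illustration only):

	#include <linux/slab.h>		/* now required explicitly for kmalloc/kfree */
	#include <linux/string.h>

	static char *dup_payload(const char *data, size_t len)
	{
		char *buf = kmalloc(len + 1, GFP_KERNEL);
		if (buf)
			memcpy(buf, data, len + 1);	/* len bytes plus NUL */
		return buf;
	}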
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca2ba7a0193c..f1ff785b2292 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -107,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
107/* all arguments to this function must be checked for validity in caller */ 108/* all arguments to this function must be checked for validity in caller */
108static inline int 109static inline int
109cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 110cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
110 struct cifsInodeInfo *pCifsInode, 111 struct cifsInodeInfo *pCifsInode, __u32 oplock,
111 struct cifsFileInfo *pCifsFile, __u32 oplock,
112 u16 netfid) 112 u16 netfid)
113{ 113{
114 114
@@ -135,15 +135,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
136 (file->f_path.dentry->d_inode->i_size == 136 (file->f_path.dentry->d_inode->i_size ==
137 (loff_t)le64_to_cpu(buf->EndOfFile))) { 137 (loff_t)le64_to_cpu(buf->EndOfFile))) {
138 cFYI(1, ("inode unchanged on server")); 138 cFYI(1, "inode unchanged on server");
139 } else { 139 } else {
140 if (file->f_path.dentry->d_inode->i_mapping) { 140 if (file->f_path.dentry->d_inode->i_mapping) {
141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
142 if (rc != 0) 142 if (rc != 0)
143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
144 } 144 }
145 cFYI(1, ("invalidating remote inode since open detected it " 145 cFYI(1, "invalidating remote inode since open detected it "
146 "changed")); 146 "changed");
147 invalidate_remote_inode(file->f_path.dentry->d_inode); 147 invalidate_remote_inode(file->f_path.dentry->d_inode);
148 } */ 148 } */
149 149
@@ -151,8 +151,8 @@ psx_client_can_cache:
151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
152 pCifsInode->clientCanCacheAll = true; 152 pCifsInode->clientCanCacheAll = true;
153 pCifsInode->clientCanCacheRead = true; 153 pCifsInode->clientCanCacheRead = true;
154 cFYI(1, ("Exclusive Oplock granted on inode %p", 154 cFYI(1, "Exclusive Oplock granted on inode %p",
155 file->f_path.dentry->d_inode)); 155 file->f_path.dentry->d_inode);
156 } else if ((oplock & 0xF) == OPLOCK_READ) 156 } else if ((oplock & 0xF) == OPLOCK_READ)
157 pCifsInode->clientCanCacheRead = true; 157 pCifsInode->clientCanCacheRead = true;
158 158
@@ -189,8 +189,8 @@ cifs_fill_filedata(struct file *file)
189 if (file->private_data != NULL) { 189 if (file->private_data != NULL) {
190 return pCifsFile; 190 return pCifsFile;
191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) 191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
192 cERROR(1, ("could not find file instance for " 192 cERROR(1, "could not find file instance for "
193 "new file %p", file)); 193 "new file %p", file);
194 return NULL; 194 return NULL;
195} 195}
196 196
@@ -216,7 +216,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
216 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 216 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
217 (file->f_path.dentry->d_inode->i_size == 217 (file->f_path.dentry->d_inode->i_size ==
218 (loff_t)le64_to_cpu(buf->EndOfFile))) { 218 (loff_t)le64_to_cpu(buf->EndOfFile))) {
219 cFYI(1, ("inode unchanged on server")); 219 cFYI(1, "inode unchanged on server");
220 } else { 220 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 221 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 222 /* BB no need to lock inode until after invalidate
@@ -225,8 +225,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
225 if (rc != 0) 225 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
227 } 227 }
228 cFYI(1, ("invalidating remote inode since open detected it " 228 cFYI(1, "invalidating remote inode since open detected it "
229 "changed")); 229 "changed");
230 invalidate_remote_inode(file->f_path.dentry->d_inode); 230 invalidate_remote_inode(file->f_path.dentry->d_inode);
231 } 231 }
232 232
@@ -241,8 +241,8 @@ client_can_cache:
241 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 241 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
242 pCifsInode->clientCanCacheAll = true; 242 pCifsInode->clientCanCacheAll = true;
243 pCifsInode->clientCanCacheRead = true; 243 pCifsInode->clientCanCacheRead = true;
244 cFYI(1, ("Exclusive Oplock granted on inode %p", 244 cFYI(1, "Exclusive Oplock granted on inode %p",
245 file->f_path.dentry->d_inode)); 245 file->f_path.dentry->d_inode);
246 } else if ((*oplock & 0xF) == OPLOCK_READ) 246 } else if ((*oplock & 0xF) == OPLOCK_READ)
247 pCifsInode->clientCanCacheRead = true; 247 pCifsInode->clientCanCacheRead = true;
248 248
@@ -284,8 +284,8 @@ int cifs_open(struct inode *inode, struct file *file)
284 return rc; 284 return rc;
285 } 285 }
286 286
287 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s", 287 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
288 inode, file->f_flags, full_path)); 288 inode, file->f_flags, full_path);
289 289
290 if (oplockEnabled) 290 if (oplockEnabled)
291 oplock = REQ_OPLOCK; 291 oplock = REQ_OPLOCK;
@@ -297,27 +297,29 @@ int cifs_open(struct inode *inode, struct file *file)
297 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 297 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
298 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 298 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
299 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 299 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
300 oflags |= SMB_O_CREAT;
300 /* can not refresh inode info since size could be stale */ 301 /* can not refresh inode info since size could be stale */
301 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, 302 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
302 cifs_sb->mnt_file_mode /* ignored */, 303 inode->i_sb,
303 oflags, &oplock, &netfid, xid); 304 cifs_sb->mnt_file_mode /* ignored */,
305 oflags, &oplock, &netfid, xid);
304 if (rc == 0) { 306 if (rc == 0) {
305 cFYI(1, ("posix open succeeded")); 307 cFYI(1, "posix open succeeded");
306 /* no need for special case handling of setting mode 308 /* no need for special case handling of setting mode
307 on read only files needed here */ 309 on read only files needed here */
308 310
309 pCifsFile = cifs_fill_filedata(file); 311 pCifsFile = cifs_fill_filedata(file);
310 cifs_posix_open_inode_helper(inode, file, pCifsInode, 312 cifs_posix_open_inode_helper(inode, file, pCifsInode,
311 pCifsFile, oplock, netfid); 313 oplock, netfid);
312 goto out; 314 goto out;
313 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 315 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
314 if (tcon->ses->serverNOS) 316 if (tcon->ses->serverNOS)
315 cERROR(1, ("server %s of type %s returned" 317 cERROR(1, "server %s of type %s returned"
316 " unexpected error on SMB posix open" 318 " unexpected error on SMB posix open"
317 ", disabling posix open support." 319 ", disabling posix open support."
318 " Check if server update available.", 320 " Check if server update available.",
319 tcon->ses->serverName, 321 tcon->ses->serverName,
320 tcon->ses->serverNOS)); 322 tcon->ses->serverNOS);
321 tcon->broken_posix_open = true; 323 tcon->broken_posix_open = true;
322 } else if ((rc != -EIO) && (rc != -EREMOTE) && 324 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
323 (rc != -EOPNOTSUPP)) /* path not found or net err */ 325 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -385,7 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
385 & CIFS_MOUNT_MAP_SPECIAL_CHR); 387 & CIFS_MOUNT_MAP_SPECIAL_CHR);
386 } 388 }
387 if (rc) { 389 if (rc) {
388 cFYI(1, ("cifs_open returned 0x%x", rc)); 390 cFYI(1, "cifs_open returned 0x%x", rc);
389 goto out; 391 goto out;
390 } 392 }
391 393
@@ -468,7 +470,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
468 } 470 }
469 471
470 if (file->f_path.dentry == NULL) { 472 if (file->f_path.dentry == NULL) {
471 cERROR(1, ("no valid name if dentry freed")); 473 cERROR(1, "no valid name if dentry freed");
472 dump_stack(); 474 dump_stack();
473 rc = -EBADF; 475 rc = -EBADF;
474 goto reopen_error_exit; 476 goto reopen_error_exit;
@@ -476,7 +478,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
476 478
477 inode = file->f_path.dentry->d_inode; 479 inode = file->f_path.dentry->d_inode;
478 if (inode == NULL) { 480 if (inode == NULL) {
479 cERROR(1, ("inode not valid")); 481 cERROR(1, "inode not valid");
480 dump_stack(); 482 dump_stack();
481 rc = -EBADF; 483 rc = -EBADF;
482 goto reopen_error_exit; 484 goto reopen_error_exit;
@@ -498,8 +500,8 @@ reopen_error_exit:
498 return rc; 500 return rc;
499 } 501 }
500 502
501 cFYI(1, ("inode = 0x%p file flags 0x%x for %s", 503 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
502 inode, file->f_flags, full_path)); 504 inode, file->f_flags, full_path);
503 505
504 if (oplockEnabled) 506 if (oplockEnabled)
505 oplock = REQ_OPLOCK; 507 oplock = REQ_OPLOCK;
@@ -512,10 +514,11 @@ reopen_error_exit:
512 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 514 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
513 /* can not refresh inode info since size could be stale */ 515 /* can not refresh inode info since size could be stale */
514 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, 516 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
515 cifs_sb->mnt_file_mode /* ignored */, 517 inode->i_sb,
516 oflags, &oplock, &netfid, xid); 518 cifs_sb->mnt_file_mode /* ignored */,
519 oflags, &oplock, &netfid, xid);
517 if (rc == 0) { 520 if (rc == 0) {
518 cFYI(1, ("posix reopen succeeded")); 521 cFYI(1, "posix reopen succeeded");
519 goto reopen_success; 522 goto reopen_success;
520 } 523 }
521 /* fallthrough to retry open the old way on errors, especially 524 /* fallthrough to retry open the old way on errors, especially
@@ -536,8 +539,8 @@ reopen_error_exit:
536 CIFS_MOUNT_MAP_SPECIAL_CHR); 539 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 if (rc) { 540 if (rc) {
538 mutex_unlock(&pCifsFile->fh_mutex); 541 mutex_unlock(&pCifsFile->fh_mutex);
539 cFYI(1, ("cifs_open returned 0x%x", rc)); 542 cFYI(1, "cifs_open returned 0x%x", rc);
540 cFYI(1, ("oplock: %d", oplock)); 543 cFYI(1, "oplock: %d", oplock);
541 } else { 544 } else {
542reopen_success: 545reopen_success:
543 pCifsFile->netfid = netfid; 546 pCifsFile->netfid = netfid;
@@ -569,8 +572,8 @@ reopen_success:
569 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 572 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
570 pCifsInode->clientCanCacheAll = true; 573 pCifsInode->clientCanCacheAll = true;
571 pCifsInode->clientCanCacheRead = true; 574 pCifsInode->clientCanCacheRead = true;
572 cFYI(1, ("Exclusive Oplock granted on inode %p", 575 cFYI(1, "Exclusive Oplock granted on inode %p",
573 file->f_path.dentry->d_inode)); 576 file->f_path.dentry->d_inode);
574 } else if ((oplock & 0xF) == OPLOCK_READ) { 577 } else if ((oplock & 0xF) == OPLOCK_READ) {
575 pCifsInode->clientCanCacheRead = true; 578 pCifsInode->clientCanCacheRead = true;
576 pCifsInode->clientCanCacheAll = false; 579 pCifsInode->clientCanCacheAll = false;
@@ -618,8 +621,7 @@ int cifs_close(struct inode *inode, struct file *file)
618 the struct would be in each open file, 621 the struct would be in each open file,
619 but this should give enough time to 622 but this should give enough time to
620 clear the socket */ 623 clear the socket */
621 cFYI(DBG2, ("close delay, write pending")); 624 cFYI(DBG2, "close delay, write pending");
623 msleep(timeout); 625 msleep(timeout);
624 timeout *= 4; 626 timeout *= 4;
625 } 627 }
@@ -652,7 +654,7 @@ int cifs_close(struct inode *inode, struct file *file)
652 654
653 read_lock(&GlobalSMBSeslock); 655 read_lock(&GlobalSMBSeslock);
654 if (list_empty(&(CIFS_I(inode)->openFileList))) { 656 if (list_empty(&(CIFS_I(inode)->openFileList))) {
655 cFYI(1, ("closing last open instance for inode %p", inode)); 657 cFYI(1, "closing last open instance for inode %p", inode);
656 /* if the file is not open we do not know if we can cache info 658 /* if the file is not open we do not know if we can cache info
657 on this inode, much less write behind and read ahead */ 659 on this inode, much less write behind and read ahead */
658 CIFS_I(inode)->clientCanCacheRead = false; 660 CIFS_I(inode)->clientCanCacheRead = false;
@@ -673,7 +675,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
673 (struct cifsFileInfo *)file->private_data; 675 (struct cifsFileInfo *)file->private_data;
674 char *ptmp; 676 char *ptmp;
675 677
676 cFYI(1, ("Closedir inode = 0x%p", inode)); 678 cFYI(1, "Closedir inode = 0x%p", inode);
677 679
678 xid = GetXid(); 680 xid = GetXid();
679 681
@@ -684,22 +686,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
684 686
685 pTcon = cifs_sb->tcon; 687 pTcon = cifs_sb->tcon;
686 688
687 cFYI(1, ("Freeing private data in close dir")); 689 cFYI(1, "Freeing private data in close dir");
688 write_lock(&GlobalSMBSeslock); 690 write_lock(&GlobalSMBSeslock);
689 if (!pCFileStruct->srch_inf.endOfSearch && 691 if (!pCFileStruct->srch_inf.endOfSearch &&
690 !pCFileStruct->invalidHandle) { 692 !pCFileStruct->invalidHandle) {
691 pCFileStruct->invalidHandle = true; 693 pCFileStruct->invalidHandle = true;
692 write_unlock(&GlobalSMBSeslock); 694 write_unlock(&GlobalSMBSeslock);
693 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 695 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
694 cFYI(1, ("Closing uncompleted readdir with rc %d", 696 cFYI(1, "Closing uncompleted readdir with rc %d",
695 rc)); 697 rc);
696 /* not much we can do if it fails anyway, ignore rc */ 698 /* not much we can do if it fails anyway, ignore rc */
697 rc = 0; 699 rc = 0;
698 } else 700 } else
699 write_unlock(&GlobalSMBSeslock); 701 write_unlock(&GlobalSMBSeslock);
700 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 702 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
701 if (ptmp) { 703 if (ptmp) {
702 cFYI(1, ("closedir free smb buf in srch struct")); 704 cFYI(1, "closedir free smb buf in srch struct");
703 pCFileStruct->srch_inf.ntwrk_buf_start = NULL; 705 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
704 if (pCFileStruct->srch_inf.smallBuf) 706 if (pCFileStruct->srch_inf.smallBuf)
705 cifs_small_buf_release(ptmp); 707 cifs_small_buf_release(ptmp);
@@ -747,49 +749,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
747 rc = -EACCES; 749 rc = -EACCES;
748 xid = GetXid(); 750 xid = GetXid();
749 751
750 cFYI(1, ("Lock parm: 0x%x flockflags: " 752 cFYI(1, "Lock parm: 0x%x flockflags: "
751 "0x%x flocktype: 0x%x start: %lld end: %lld", 753 "0x%x flocktype: 0x%x start: %lld end: %lld",
752 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 754 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
753 pfLock->fl_end)); 755 pfLock->fl_end);
754 756
755 if (pfLock->fl_flags & FL_POSIX) 757 if (pfLock->fl_flags & FL_POSIX)
756 cFYI(1, ("Posix")); 758 cFYI(1, "Posix");
757 if (pfLock->fl_flags & FL_FLOCK) 759 if (pfLock->fl_flags & FL_FLOCK)
758 cFYI(1, ("Flock")); 760 cFYI(1, "Flock");
759 if (pfLock->fl_flags & FL_SLEEP) { 761 if (pfLock->fl_flags & FL_SLEEP) {
760 cFYI(1, ("Blocking lock")); 762 cFYI(1, "Blocking lock");
761 wait_flag = true; 763 wait_flag = true;
762 } 764 }
763 if (pfLock->fl_flags & FL_ACCESS) 765 if (pfLock->fl_flags & FL_ACCESS)
764 cFYI(1, ("Process suspended by mandatory locking - " 766 cFYI(1, "Process suspended by mandatory locking - "
765 "not implemented yet")); 767 "not implemented yet");
766 if (pfLock->fl_flags & FL_LEASE) 768 if (pfLock->fl_flags & FL_LEASE)
767 cFYI(1, ("Lease on file - not implemented yet")); 769 cFYI(1, "Lease on file - not implemented yet");
768 if (pfLock->fl_flags & 770 if (pfLock->fl_flags &
769 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 771 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
770 cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags)); 772 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
771 773
772 if (pfLock->fl_type == F_WRLCK) { 774 if (pfLock->fl_type == F_WRLCK) {
773 cFYI(1, ("F_WRLCK ")); 775 cFYI(1, "F_WRLCK ");
774 numLock = 1; 776 numLock = 1;
775 } else if (pfLock->fl_type == F_UNLCK) { 777 } else if (pfLock->fl_type == F_UNLCK) {
776 cFYI(1, ("F_UNLCK")); 778 cFYI(1, "F_UNLCK");
777 numUnlock = 1; 779 numUnlock = 1;
778 /* Check if unlock includes more than 780 /* Check if unlock includes more than
779 one lock range */ 781 one lock range */
780 } else if (pfLock->fl_type == F_RDLCK) { 782 } else if (pfLock->fl_type == F_RDLCK) {
781 cFYI(1, ("F_RDLCK")); 783 cFYI(1, "F_RDLCK");
782 lockType |= LOCKING_ANDX_SHARED_LOCK; 784 lockType |= LOCKING_ANDX_SHARED_LOCK;
783 numLock = 1; 785 numLock = 1;
784 } else if (pfLock->fl_type == F_EXLCK) { 786 } else if (pfLock->fl_type == F_EXLCK) {
785 cFYI(1, ("F_EXLCK")); 787 cFYI(1, "F_EXLCK");
786 numLock = 1; 788 numLock = 1;
787 } else if (pfLock->fl_type == F_SHLCK) { 789 } else if (pfLock->fl_type == F_SHLCK) {
788 cFYI(1, ("F_SHLCK")); 790 cFYI(1, "F_SHLCK");
789 lockType |= LOCKING_ANDX_SHARED_LOCK; 791 lockType |= LOCKING_ANDX_SHARED_LOCK;
790 numLock = 1; 792 numLock = 1;
791 } else 793 } else
792 cFYI(1, ("Unknown type of lock")); 794 cFYI(1, "Unknown type of lock");
793 795
794 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 796 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
795 tcon = cifs_sb->tcon; 797 tcon = cifs_sb->tcon;
@@ -832,14 +834,38 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
832 0 /* wait flag */ ); 834 0 /* wait flag */ );
833 pfLock->fl_type = F_UNLCK; 835 pfLock->fl_type = F_UNLCK;
834 if (rc != 0) 836 if (rc != 0)
835 cERROR(1, ("Error unlocking previously locked " 837 cERROR(1, "Error unlocking previously locked "
836 "range %d during test of lock", rc)); 838 "range %d during test of lock", rc);
837 rc = 0; 839 rc = 0;
838 840
839 } else { 841 } else {
840 /* if rc == ERR_SHARING_VIOLATION ? */ 842 /* if rc == ERR_SHARING_VIOLATION ? */
841 rc = 0; /* do not change lock type to unlock 843 rc = 0;
842 since range in use */ 844
845 if (lockType & LOCKING_ANDX_SHARED_LOCK) {
846 pfLock->fl_type = F_WRLCK;
847 } else {
848 rc = CIFSSMBLock(xid, tcon, netfid, length,
849 pfLock->fl_start, 0, 1,
850 lockType | LOCKING_ANDX_SHARED_LOCK,
851 0 /* wait flag */);
852 if (rc == 0) {
853 rc = CIFSSMBLock(xid, tcon, netfid,
854 length, pfLock->fl_start, 1, 0,
855 lockType |
856 LOCKING_ANDX_SHARED_LOCK,
857 0 /* wait flag */);
858 pfLock->fl_type = F_RDLCK;
859 if (rc != 0)
860 cERROR(1, "Error unlocking "
861 "previously locked range %d "
862 "during test of lock", rc);
863 rc = 0;
864 } else {
865 pfLock->fl_type = F_WRLCK;
866 rc = 0;
867 }
868 }
843 } 869 }
844 870
845 FreeXid(xid); 871 FreeXid(xid);
@@ -898,9 +924,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
898 1, 0, li->type, false); 924 1, 0, li->type, false);
899 if (stored_rc) 925 if (stored_rc)
900 rc = stored_rc; 926 rc = stored_rc;
901 927 else {
902 list_del(&li->llist); 928 list_del(&li->llist);
903 kfree(li); 929 kfree(li);
930 }
904 } 931 }
905 } 932 }
906 mutex_unlock(&fid->lock_mutex); 933 mutex_unlock(&fid->lock_mutex);
@@ -963,9 +990,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
963 990
964 pTcon = cifs_sb->tcon; 991 pTcon = cifs_sb->tcon;
965 992
966 /* cFYI(1, (" write %d bytes to offset %lld of %s", write_size, *poffset, file->f_path.dentry->d_name.name)); */ 993 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, *poffset, file->f_path.dentry->d_name.name); */
969 995
970 if (file->private_data == NULL) 996 if (file->private_data == NULL)
971 return -EBADF; 997 return -EBADF;
@@ -1066,8 +1092,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1066 1092
1067 pTcon = cifs_sb->tcon; 1093 pTcon = cifs_sb->tcon;
1068 1094
1069 cFYI(1, ("write %zd bytes to offset %lld of %s", write_size, 1095 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1070 *poffset, file->f_path.dentry->d_name.name)); 1096 *poffset, file->f_path.dentry->d_name.name);
1071 1097
1072 if (file->private_data == NULL) 1098 if (file->private_data == NULL)
1073 return -EBADF; 1099 return -EBADF;
@@ -1208,7 +1234,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1208 it being zero) during stress testcases so we need to check for it */ 1234 it being zero) during stress testcases so we need to check for it */
1209 1235
1210 if (cifs_inode == NULL) { 1236 if (cifs_inode == NULL) {
1211 cERROR(1, ("Null inode passed to cifs_writeable_file")); 1237 cERROR(1, "Null inode passed to cifs_writeable_file");
1212 dump_stack(); 1238 dump_stack();
1213 return NULL; 1239 return NULL;
1214 } 1240 }
@@ -1252,7 +1278,7 @@ refind_writable:
1252 again. Note that it would be bad 1278 again. Note that it would be bad
1253 to hold up writepages here (rather than 1279 to hold up writepages here (rather than
1254 in caller) with continuous retries */ 1280 in caller) with continuous retries */
1255 cFYI(1, ("wp failed on reopen file")); 1281 cFYI(1, "wp failed on reopen file");
1256 read_lock(&GlobalSMBSeslock); 1282 read_lock(&GlobalSMBSeslock);
1257 /* can not use this handle, no write 1283 /* can not use this handle, no write
1258 pending on this one after all */ 1284 pending on this one after all */
@@ -1328,7 +1354,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1328 else if (bytes_written < 0) 1354 else if (bytes_written < 0)
1329 rc = bytes_written; 1355 rc = bytes_written;
1330 } else { 1356 } else {
1331 cFYI(1, ("No writeable filehandles for inode")); 1357 cFYI(1, "No writeable filehandles for inode");
1332 rc = -EIO; 1358 rc = -EIO;
1333 } 1359 }
1334 1360
@@ -1500,7 +1526,7 @@ retry:
1500 */ 1526 */
1501 open_file = find_writable_file(CIFS_I(mapping->host)); 1527 open_file = find_writable_file(CIFS_I(mapping->host));
1502 if (!open_file) { 1528 if (!open_file) {
1503 cERROR(1, ("No writable handles for inode")); 1529 cERROR(1, "No writable handles for inode");
1504 rc = -EBADF; 1530 rc = -EBADF;
1505 } else { 1531 } else {
1506 long_op = cifs_write_timeout(cifsi, offset); 1532 long_op = cifs_write_timeout(cifsi, offset);
@@ -1513,8 +1539,8 @@ retry:
1513 cifs_update_eof(cifsi, offset, bytes_written); 1539 cifs_update_eof(cifsi, offset, bytes_written);
1514 1540
1515 if (rc || bytes_written < bytes_to_write) { 1541 if (rc || bytes_written < bytes_to_write) {
1516 cERROR(1, ("Write2 ret %d, wrote %d", 1542 cERROR(1, "Write2 ret %d, wrote %d",
1517 rc, bytes_written)); 1543 rc, bytes_written);
1518 /* BB what if continued retry is 1544 /* BB what if continued retry is
1519 requested via mount flags? */ 1545 requested via mount flags? */
1520 if (rc == -ENOSPC) 1546 if (rc == -ENOSPC)
@@ -1575,7 +1601,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1575/* BB add check for wbc flags */ 1601/* BB add check for wbc flags */
1576 page_cache_get(page); 1602 page_cache_get(page);
1577 if (!PageUptodate(page)) 1603 if (!PageUptodate(page))
1578 cFYI(1, ("ppw - page not up to date")); 1604 cFYI(1, "ppw - page not up to date");
1579 1605
1580 /* 1606 /*
1581 * Set the "writeback" flag, and clear "dirty" in the radix tree. 1607 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1604,8 +1630,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1604 int rc; 1630 int rc;
1605 struct inode *inode = mapping->host; 1631 struct inode *inode = mapping->host;
1606 1632
1607 cFYI(1, ("write_end for page %p from pos %lld with %d bytes", 1633 cFYI(1, "write_end for page %p from pos %lld with %d bytes",
1608 page, pos, copied)); 1634 page, pos, copied);
1609 1635
1610 if (PageChecked(page)) { 1636 if (PageChecked(page)) {
1611 if (copied == len) 1637 if (copied == len)
@@ -1650,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1650 return rc; 1676 return rc;
1651} 1677}
1652 1678
1653int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1654{ 1680{
1655 int xid; 1681 int xid;
1656 int rc = 0; 1682 int rc = 0;
@@ -1661,8 +1687,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1661 1687
1662 xid = GetXid(); 1688 xid = GetXid();
1663 1689
1664 cFYI(1, ("Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1665 dentry->d_name.name, datasync)); 1691 file->f_path.dentry->d_name.name, datasync);
1666 1692
1667 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1668 if (rc == 0) { 1694 if (rc == 0) {
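
The hunk above tracks a VFS-wide prototype change: ->fsync no longer receives a dentry, only the file and the datasync flag, so the name used for logging is taken from file->f_path.dentry instead. From userspace the datasync distinction is fsync(2) versus fdatasync(2); a small runnable illustration (the ./out.dat path is an assumption):

/* Userspace view of the 'datasync' flag the kernel hook receives:
 * fdatasync() may skip pure-metadata updates (e.g. timestamps) that
 * fsync() would also flush. Illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("./out.dat", O_WRONLY | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	if (write(fd, "payload\n", 8) != 8) { perror("write"); return 1; }

	if (fdatasync(fd) < 0)	/* data (+ size) only: datasync != 0 */
		perror("fdatasync");
	if (fsync(fd) < 0)	/* data and all metadata: datasync == 0 */
		perror("fsync");

	close(fd);
	return 0;
}
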
@@ -1686,7 +1712,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1686 unsigned int rpages = 0; 1712 unsigned int rpages = 0;
1687 int rc = 0; 1713 int rc = 0;
1688 1714
1689 cFYI(1, ("sync page %p",page)); 1715 cFYI(1, "sync page %p", page);
1690 mapping = page->mapping; 1716 mapping = page->mapping;
1691 if (!mapping) 1717 if (!mapping)
1692 return 0; 1718 return 0;
@@ -1697,7 +1723,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1697/* fill in rpages then 1723/* fill in rpages then
1698 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */ 1724 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1699 1725
1700/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index)); 1726/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1701 1727
1702#if 0 1728#if 0
1703 if (rc < 0) 1729 if (rc < 0)
@@ -1731,7 +1757,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
1731 CIFS_I(inode)->write_behind_rc = 0; 1757 CIFS_I(inode)->write_behind_rc = 0;
1732 } 1758 }
1733 1759
1734 cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc)); 1760 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
1735 1761
1736 return rc; 1762 return rc;
1737} 1763}
@@ -1763,7 +1789,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1763 open_file = (struct cifsFileInfo *)file->private_data; 1789 open_file = (struct cifsFileInfo *)file->private_data;
1764 1790
1765 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1791 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1766 cFYI(1, ("attempting read on write only file instance")); 1792 cFYI(1, "attempting read on write only file instance");
1767 1793
1768 for (total_read = 0, current_offset = read_data; 1794 for (total_read = 0, current_offset = read_data;
1769 read_size > total_read; 1795 read_size > total_read;
@@ -1844,7 +1870,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1844 open_file = (struct cifsFileInfo *)file->private_data; 1870 open_file = (struct cifsFileInfo *)file->private_data;
1845 1871
1846 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1872 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1847 cFYI(1, ("attempting read on write only file instance")); 1873 cFYI(1, "attempting read on write only file instance");
1848 1874
1849 for (total_read = 0, current_offset = read_data; 1875 for (total_read = 0, current_offset = read_data;
1850 read_size > total_read; 1876 read_size > total_read;
@@ -1895,7 +1921,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1895 xid = GetXid(); 1921 xid = GetXid();
1896 rc = cifs_revalidate_file(file); 1922 rc = cifs_revalidate_file(file);
1897 if (rc) { 1923 if (rc) {
1898 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1924 cFYI(1, "Validation prior to mmap failed, error=%d", rc);
1899 FreeXid(xid); 1925 FreeXid(xid);
1900 return rc; 1926 return rc;
1901 } 1927 }
@@ -1906,8 +1932,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1906 1932
1907 1933
1908static void cifs_copy_cache_pages(struct address_space *mapping, 1934static void cifs_copy_cache_pages(struct address_space *mapping,
1909 struct list_head *pages, int bytes_read, char *data, 1935 struct list_head *pages, int bytes_read, char *data)
1910 struct pagevec *plru_pvec)
1911{ 1936{
1912 struct page *page; 1937 struct page *page;
1913 char *target; 1938 char *target;
@@ -1919,10 +1944,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1919 page = list_entry(pages->prev, struct page, lru); 1944 page = list_entry(pages->prev, struct page, lru);
1920 list_del(&page->lru); 1945 list_del(&page->lru);
1921 1946
1922 if (add_to_page_cache(page, mapping, page->index, 1947 if (add_to_page_cache_lru(page, mapping, page->index,
1923 GFP_KERNEL)) { 1948 GFP_KERNEL)) {
1924 page_cache_release(page); 1949 page_cache_release(page);
1925 cFYI(1, ("Add page cache failed")); 1950 cFYI(1, "Add page cache failed");
1926 data += PAGE_CACHE_SIZE; 1951 data += PAGE_CACHE_SIZE;
1927 bytes_read -= PAGE_CACHE_SIZE; 1952 bytes_read -= PAGE_CACHE_SIZE;
1928 continue; 1953 continue;
@@ -1945,8 +1970,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1945 flush_dcache_page(page); 1970 flush_dcache_page(page);
1946 SetPageUptodate(page); 1971 SetPageUptodate(page);
1947 unlock_page(page); 1972 unlock_page(page);
1948 if (!pagevec_add(plru_pvec, page))
1949 __pagevec_lru_add_file(plru_pvec);
1950 data += PAGE_CACHE_SIZE; 1973 data += PAGE_CACHE_SIZE;
1951 } 1974 }
1952 return; 1975 return;
@@ -1965,7 +1988,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1965 unsigned int read_size, i; 1988 unsigned int read_size, i;
1966 char *smb_read_data = NULL; 1989 char *smb_read_data = NULL;
1967 struct smb_com_read_rsp *pSMBr; 1990 struct smb_com_read_rsp *pSMBr;
1968 struct pagevec lru_pvec;
1969 struct cifsFileInfo *open_file; 1991 struct cifsFileInfo *open_file;
1970 int buf_type = CIFS_NO_BUFFER; 1992 int buf_type = CIFS_NO_BUFFER;
1971 1993
@@ -1979,8 +2001,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1979 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2001 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1980 pTcon = cifs_sb->tcon; 2002 pTcon = cifs_sb->tcon;
1981 2003
1982 pagevec_init(&lru_pvec, 0); 2004 cFYI(DBG2, "rpages: num pages %d", num_pages);
1983 cFYI(DBG2, ("rpages: num pages %d", num_pages));
1984 for (i = 0; i < num_pages; ) { 2005 for (i = 0; i < num_pages; ) {
1985 unsigned contig_pages; 2006 unsigned contig_pages;
1986 struct page *tmp_page; 2007 struct page *tmp_page;
@@ -2013,8 +2034,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2013 /* Read size needs to be in multiples of one page */ 2034 /* Read size needs to be in multiples of one page */
2014 read_size = min_t(const unsigned int, read_size, 2035 read_size = min_t(const unsigned int, read_size,
2015 cifs_sb->rsize & PAGE_CACHE_MASK); 2036 cifs_sb->rsize & PAGE_CACHE_MASK);
2016 cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d", 2037 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
2017 read_size, contig_pages)); 2038 read_size, contig_pages);
2018 rc = -EAGAIN; 2039 rc = -EAGAIN;
2019 while (rc == -EAGAIN) { 2040 while (rc == -EAGAIN) {
2020 if ((open_file->invalidHandle) && 2041 if ((open_file->invalidHandle) &&
@@ -2041,14 +2062,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2041 } 2062 }
2042 } 2063 }
2043 if ((rc < 0) || (smb_read_data == NULL)) { 2064 if ((rc < 0) || (smb_read_data == NULL)) {
2044 cFYI(1, ("Read error in readpages: %d", rc)); 2065 cFYI(1, "Read error in readpages: %d", rc);
2045 break; 2066 break;
2046 } else if (bytes_read > 0) { 2067 } else if (bytes_read > 0) {
2047 task_io_account_read(bytes_read); 2068 task_io_account_read(bytes_read);
2048 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 2069 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2049 cifs_copy_cache_pages(mapping, page_list, bytes_read, 2070 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2050 smb_read_data + 4 /* RFC1001 hdr */ + 2071 smb_read_data + 4 /* RFC1001 hdr */ +
2051 le16_to_cpu(pSMBr->DataOffset), &lru_pvec); 2072 le16_to_cpu(pSMBr->DataOffset));
2052 2073
2053 i += bytes_read >> PAGE_CACHE_SHIFT; 2074 i += bytes_read >> PAGE_CACHE_SHIFT;
2054 cifs_stats_bytes_read(pTcon, bytes_read); 2075 cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2064,9 +2085,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2064 /* break; */ 2085 /* break; */
2065 } 2086 }
2066 } else { 2087 } else {
2067 cFYI(1, ("No bytes read (%d) at offset %lld . " 2088 cFYI(1, "No bytes read (%d) at offset %lld . "
2068 "Cleaning remaining pages from readahead list", 2089 "Cleaning remaining pages from readahead list",
2069 bytes_read, offset)); 2090 bytes_read, offset);
2070 /* BB turn off caching and do new lookup on 2091 /* BB turn off caching and do new lookup on
2071 file size at server? */ 2092 file size at server? */
2072 break; 2093 break;
@@ -2081,8 +2102,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2081 bytes_read = 0; 2102 bytes_read = 0;
2082 } 2103 }
2083 2104
2084 pagevec_lru_add_file(&lru_pvec);
2085
2086/* need to free smb_read_data buf before exit */ 2105/* need to free smb_read_data buf before exit */
2087 if (smb_read_data) { 2106 if (smb_read_data) {
2088 if (buf_type == CIFS_SMALL_BUFFER) 2107 if (buf_type == CIFS_SMALL_BUFFER)
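
The readpages hunks above drop cifs's private LRU batching: add_to_page_cache_lru() inserts a page into the page cache and onto the file LRU in one call, so the struct pagevec argument, the pagevec_add()/__pagevec_lru_add_file() pair, and the final pagevec_lru_add_file() flush all disappear. A kernel-style sketch of the resulting loop, assuming the 2.6.3x page-cache API this patch targets, with copy_into() as a hypothetical stand-in for the memcpy/flush_dcache_page/SetPageUptodate body (this compiles only in a kernel tree of that era):

/* Kernel-style sketch; not a drop-in replacement for the cifs code. */
#include <linux/pagemap.h>
#include <linux/list.h>

extern void copy_into(struct page *page, char *data);	/* hypothetical */

static void fill_pages(struct address_space *mapping,
		       struct list_head *pages, char *data)
{
	struct page *page;

	while (!list_empty(pages)) {
		page = list_entry(pages->prev, struct page, lru);
		list_del(&page->lru);

		/* one call now handles both page-cache insertion
		 * and LRU placement */
		if (add_to_page_cache_lru(page, mapping, page->index,
					  GFP_KERNEL)) {
			page_cache_release(page);
			data += PAGE_CACHE_SIZE;
			continue;
		}

		copy_into(page, data);
		unlock_page(page);
		data += PAGE_CACHE_SIZE;
	}
}
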
@@ -2111,7 +2130,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2111 if (rc < 0) 2130 if (rc < 0)
2112 goto io_error; 2131 goto io_error;
2113 else 2132 else
2114 cFYI(1, ("Bytes read %d", rc)); 2133 cFYI(1, "Bytes read %d", rc);
2115 2134
2116 file->f_path.dentry->d_inode->i_atime = 2135 file->f_path.dentry->d_inode->i_atime =
2117 current_fs_time(file->f_path.dentry->d_inode->i_sb); 2136 current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2143,8 +2162,8 @@ static int cifs_readpage(struct file *file, struct page *page)
2143 return rc; 2162 return rc;
2144 } 2163 }
2145 2164
2146 cFYI(1, ("readpage %p at offset %d 0x%x\n", 2165 cFYI(1, "readpage %p at offset %d 0x%x\n",
2147 page, (int)offset, (int)offset)); 2166 page, (int)offset, (int)offset);
2148 2167
2149 rc = cifs_readpage_worker(file, page, &offset); 2168 rc = cifs_readpage_worker(file, page, &offset);
2150 2169
@@ -2214,7 +2233,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2214 struct page *page; 2233 struct page *page;
2215 int rc = 0; 2234 int rc = 0;
2216 2235
2217 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2236 cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
2218 2237
2219 page = grab_cache_page_write_begin(mapping, index, flags); 2238 page = grab_cache_page_write_begin(mapping, index, flags);
2220 if (!page) { 2239 if (!page) {
@@ -2286,12 +2305,10 @@ cifs_oplock_break(struct slow_work *work)
2286 int rc, waitrc = 0; 2305 int rc, waitrc = 0;
2287 2306
2288 if (inode && S_ISREG(inode->i_mode)) { 2307 if (inode && S_ISREG(inode->i_mode)) {
2289#ifdef CONFIG_CIFS_EXPERIMENTAL 2308 if (cinode->clientCanCacheRead)
2290 if (cinode->clientCanCacheAll == 0)
2291 break_lease(inode, O_RDONLY); 2309 break_lease(inode, O_RDONLY);
2292 else if (cinode->clientCanCacheRead == 0) 2310 else
2293 break_lease(inode, O_WRONLY); 2311 break_lease(inode, O_WRONLY);
2294#endif
2295 rc = filemap_fdatawrite(inode->i_mapping); 2312 rc = filemap_fdatawrite(inode->i_mapping);
2296 if (cinode->clientCanCacheRead == 0) { 2313 if (cinode->clientCanCacheRead == 0) {
2297 waitrc = filemap_fdatawait(inode->i_mapping); 2314 waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2301,7 +2318,7 @@ cifs_oplock_break(struct slow_work *work)
2301 rc = waitrc; 2318 rc = waitrc;
2302 if (rc) 2319 if (rc)
2303 cinode->write_behind_rc = rc; 2320 cinode->write_behind_rc = rc;
2304 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc)); 2321 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2305 } 2322 }
2306 2323
2307 /* 2324 /*
@@ -2313,7 +2330,7 @@ cifs_oplock_break(struct slow_work *work)
2313 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2330 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2314 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2331 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2315 LOCKING_ANDX_OPLOCK_RELEASE, false); 2332 LOCKING_ANDX_OPLOCK_RELEASE, false);
2316 cFYI(1, ("Oplock release rc = %d", rc)); 2333 cFYI(1, "Oplock release rc = %d", rc);
2317 } 2334 }
2318} 2335}
2319 2336
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 723daaccbd0e..62b324f26a56 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <asm/div64.h> 25#include <asm/div64.h>
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -85,30 +86,30 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
85{ 86{
86 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 87 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
87 88
88 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid)); 89 cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
89 90
90 if (inode->i_state & I_NEW) { 91 if (inode->i_state & I_NEW) {
91 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid)); 92 cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
92 return; 93 return;
93 } 94 }
94 95
95 /* don't bother with revalidation if we have an oplock */ 96 /* don't bother with revalidation if we have an oplock */
96 if (cifs_i->clientCanCacheRead) { 97 if (cifs_i->clientCanCacheRead) {
97 cFYI(1, ("%s: inode %llu is oplocked", __func__, 98 cFYI(1, "%s: inode %llu is oplocked", __func__,
98 cifs_i->uniqueid)); 99 cifs_i->uniqueid);
99 return; 100 return;
100 } 101 }
101 102
102 /* revalidate if mtime or size have changed */ 103 /* revalidate if mtime or size have changed */
103 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && 104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
104 cifs_i->server_eof == fattr->cf_eof) { 105 cifs_i->server_eof == fattr->cf_eof) {
105 cFYI(1, ("%s: inode %llu is unchanged", __func__, 106 cFYI(1, "%s: inode %llu is unchanged", __func__,
106 cifs_i->uniqueid)); 107 cifs_i->uniqueid);
107 return; 108 return;
108 } 109 }
109 110
110 cFYI(1, ("%s: invalidating inode %llu mapping", __func__, 111 cFYI(1, "%s: invalidating inode %llu mapping", __func__,
111 cifs_i->uniqueid)); 112 cifs_i->uniqueid);
112 cifs_i->invalid_mapping = true; 113 cifs_i->invalid_mapping = true;
113} 114}
114 115
@@ -136,15 +137,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
136 inode->i_mode = fattr->cf_mode; 137 inode->i_mode = fattr->cf_mode;
137 138
138 cifs_i->cifsAttrs = fattr->cf_cifsattrs; 139 cifs_i->cifsAttrs = fattr->cf_cifsattrs;
139 cifs_i->uniqueid = fattr->cf_uniqueid;
140 140
141 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) 141 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
142 cifs_i->time = 0; 142 cifs_i->time = 0;
143 else 143 else
144 cifs_i->time = jiffies; 144 cifs_i->time = jiffies;
145 145
146 cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode, 146 cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
147 oldtime, cifs_i->time)); 147 oldtime, cifs_i->time);
148 148
149 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 149 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
150 150
@@ -169,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
169 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); 169 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
170} 170}
171 171
172void
173cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
174{
175 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
176
177 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
178 return;
179
180 fattr->cf_uniqueid = iunique(sb, ROOT_I);
181}
182
172/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ 183/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
173void 184void
174cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, 185cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
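
The cifs_fill_uniqueid() helper added above assigns a locally generated inode number via iunique(sb, ROOT_I) whenever the mount is not using server-supplied inode numbers (CIFS_MOUNT_SERVER_INUM clear); iunique() hands out a value unique among inodes on that superblock, above the reserved floor. A small userspace analogue of that allocator, with in_use() as a hypothetical stand-in for the inode-hash probe:

/* Userspace analogue of an iunique()-style allocator: scan upward
 * for a number above a reserved floor that is not currently in use. */
#include <stdbool.h>

#define ROOT_INO 1UL	/* reserved floor, playing the role of ROOT_I */

extern bool in_use(unsigned long ino);	/* hypothetical lookup */

unsigned long iunique_like(void)
{
	static unsigned long counter = ROOT_INO;
	unsigned long ino;

	do {
		if (++counter <= ROOT_INO)	/* handle wraparound */
			counter = ROOT_INO + 1;
		ino = counter;
	} while (in_use(ino));

	return ino;
}
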
@@ -226,7 +237,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
226 /* safest to call it a file if we do not know */ 237 /* safest to call it a file if we do not know */
227 fattr->cf_mode |= S_IFREG; 238 fattr->cf_mode |= S_IFREG;
228 fattr->cf_dtype = DT_REG; 239 fattr->cf_dtype = DT_REG;
229 cFYI(1, ("unknown type %d", le32_to_cpu(info->Type))); 240 cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
230 break; 241 break;
231 } 242 }
232 243
@@ -255,7 +266,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
255{ 266{
256 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 267 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
257 268
258 cFYI(1, ("creating fake fattr for DFS referral")); 269 cFYI(1, "creating fake fattr for DFS referral");
259 270
260 memset(fattr, 0, sizeof(*fattr)); 271 memset(fattr, 0, sizeof(*fattr));
261 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; 272 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -304,7 +315,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
304 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 315 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
305 316
306 tcon = cifs_sb->tcon; 317 tcon = cifs_sb->tcon;
307 cFYI(1, ("Getting info on %s", full_path)); 318 cFYI(1, "Getting info on %s", full_path);
308 319
309 /* could have done a find first instead but this returns more info */ 320 /* could have done a find first instead but this returns more info */
310 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 321 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -322,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
322 333
323 if (*pinode == NULL) { 334 if (*pinode == NULL) {
324 /* get new inode */ 335 /* get new inode */
336 cifs_fill_uniqueid(sb, &fattr);
325 *pinode = cifs_iget(sb, &fattr); 337 *pinode = cifs_iget(sb, &fattr);
326 if (!*pinode) 338 if (!*pinode)
327 rc = -ENOMEM; 339 rc = -ENOMEM;
@@ -372,7 +384,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
372 &bytes_read, &pbuf, &buf_type); 384 &bytes_read, &pbuf, &buf_type);
373 if ((rc == 0) && (bytes_read >= 8)) { 385 if ((rc == 0) && (bytes_read >= 8)) {
374 if (memcmp("IntxBLK", pbuf, 8) == 0) { 386 if (memcmp("IntxBLK", pbuf, 8) == 0) {
375 cFYI(1, ("Block device")); 387 cFYI(1, "Block device");
376 fattr->cf_mode |= S_IFBLK; 388 fattr->cf_mode |= S_IFBLK;
377 fattr->cf_dtype = DT_BLK; 389 fattr->cf_dtype = DT_BLK;
378 if (bytes_read == 24) { 390 if (bytes_read == 24) {
@@ -384,7 +396,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
384 fattr->cf_rdev = MKDEV(mjr, mnr); 396 fattr->cf_rdev = MKDEV(mjr, mnr);
385 } 397 }
386 } else if (memcmp("IntxCHR", pbuf, 8) == 0) { 398 } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
387 cFYI(1, ("Char device")); 399 cFYI(1, "Char device");
388 fattr->cf_mode |= S_IFCHR; 400 fattr->cf_mode |= S_IFCHR;
389 fattr->cf_dtype = DT_CHR; 401 fattr->cf_dtype = DT_CHR;
390 if (bytes_read == 24) { 402 if (bytes_read == 24) {
@@ -396,7 +408,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
396 fattr->cf_rdev = MKDEV(mjr, mnr); 408 fattr->cf_rdev = MKDEV(mjr, mnr);
397 } 409 }
398 } else if (memcmp("IntxLNK", pbuf, 7) == 0) { 410 } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
399 cFYI(1, ("Symlink")); 411 cFYI(1, "Symlink");
400 fattr->cf_mode |= S_IFLNK; 412 fattr->cf_mode |= S_IFLNK;
401 fattr->cf_dtype = DT_LNK; 413 fattr->cf_dtype = DT_LNK;
402 } else { 414 } else {
@@ -438,10 +450,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
438 else if (rc > 3) { 450 else if (rc > 3) {
439 mode = le32_to_cpu(*((__le32 *)ea_value)); 451 mode = le32_to_cpu(*((__le32 *)ea_value));
440 fattr->cf_mode &= ~SFBITS_MASK; 452 fattr->cf_mode &= ~SFBITS_MASK;
441 cFYI(1, ("special bits 0%o org mode 0%o", mode, 453 cFYI(1, "special bits 0%o org mode 0%o", mode,
442 fattr->cf_mode)); 454 fattr->cf_mode);
443 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; 455 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
444 cFYI(1, ("special mode bits 0%o", mode)); 456 cFYI(1, "special mode bits 0%o", mode);
445 } 457 }
446 458
447 return 0; 459 return 0;
@@ -547,11 +559,11 @@ int cifs_get_inode_info(struct inode **pinode,
547 struct cifs_fattr fattr; 559 struct cifs_fattr fattr;
548 560
549 pTcon = cifs_sb->tcon; 561 pTcon = cifs_sb->tcon;
550 cFYI(1, ("Getting info on %s", full_path)); 562 cFYI(1, "Getting info on %s", full_path);
551 563
552 if ((pfindData == NULL) && (*pinode != NULL)) { 564 if ((pfindData == NULL) && (*pinode != NULL)) {
553 if (CIFS_I(*pinode)->clientCanCacheRead) { 565 if (CIFS_I(*pinode)->clientCanCacheRead) {
554 cFYI(1, ("No need to revalidate cached inode sizes")); 566 cFYI(1, "No need to revalidate cached inode sizes");
555 return rc; 567 return rc;
556 } 568 }
557 } 569 }
@@ -617,7 +629,7 @@ int cifs_get_inode_info(struct inode **pinode,
617 cifs_sb->mnt_cifs_flags & 629 cifs_sb->mnt_cifs_flags &
618 CIFS_MOUNT_MAP_SPECIAL_CHR); 630 CIFS_MOUNT_MAP_SPECIAL_CHR);
619 if (rc1 || !fattr.cf_uniqueid) { 631 if (rc1 || !fattr.cf_uniqueid) {
620 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 632 cFYI(1, "GetSrvInodeNum rc %d", rc1);
621 fattr.cf_uniqueid = iunique(sb, ROOT_I); 633 fattr.cf_uniqueid = iunique(sb, ROOT_I);
622 cifs_autodisable_serverino(cifs_sb); 634 cifs_autodisable_serverino(cifs_sb);
623 } 635 }
@@ -633,13 +645,13 @@ int cifs_get_inode_info(struct inode **pinode,
633 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { 645 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
634 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); 646 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
635 if (tmprc) 647 if (tmprc)
636 cFYI(1, ("cifs_sfu_type failed: %d", tmprc)); 648 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
637 } 649 }
638 650
639#ifdef CONFIG_CIFS_EXPERIMENTAL 651#ifdef CONFIG_CIFS_EXPERIMENTAL
640 /* fill in 0777 bits from ACL */ 652 /* fill in 0777 bits from ACL */
641 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 653 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
642 cFYI(1, ("Getting mode bits from ACL")); 654 cFYI(1, "Getting mode bits from ACL");
643 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 655 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
644 } 656 }
645#endif 657#endif
@@ -714,6 +726,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
714 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 726 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
715 return 0; 727 return 0;
716 728
729 /*
730 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
731 * verboten. Disable serverino and return it as if it were found, the
732 * caller can discard it, generate a uniqueid and retry the find
733 */
734 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
735 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
736 cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
737 }
738
717 return 1; 739 return 1;
718} 740}
719 741
@@ -733,15 +755,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
733 unsigned long hash; 755 unsigned long hash;
734 struct inode *inode; 756 struct inode *inode;
735 757
736 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); 758retry_iget5_locked:
759 cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
737 760
738 /* hash down to 32-bits on 32-bit arch */ 761 /* hash down to 32-bits on 32-bit arch */
739 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); 762 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
740 763
741 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); 764 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
742
743 /* we have fattrs in hand, update the inode */
744 if (inode) { 765 if (inode) {
766 /* was there a problematic inode number collision? */
767 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
768 iput(inode);
769 fattr->cf_uniqueid = iunique(sb, ROOT_I);
770 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
771 goto retry_iget5_locked;
772 }
773
745 cifs_fattr_to_inode(inode, fattr); 774 cifs_fattr_to_inode(inode, fattr);
746 if (sb->s_flags & MS_NOATIME) 775 if (sb->s_flags & MS_NOATIME)
747 inode->i_flags |= S_NOATIME | S_NOCMTIME; 776 inode->i_flags |= S_NOATIME | S_NOCMTIME;
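
Taken together with the cifs_find_inode() change earlier, the retry loop above handles an inode-number collision: a match that turns out to be a directory already wired into the dentry tree cannot be reused (hardlinked directories are forbidden), so cifs_iget() drops that inode, mints a fresh local uniqueid, and repeats the iget5_locked() lookup; the second pass cannot collide again. A compact sketch of that flag-and-retry shape, where lookup(), drop(), new_key() and the COLLISION flag are hypothetical stand-ins for iget5_locked(), iput(), iunique() and CIFS_FATTR_INO_COLLISION:

#include <stddef.h>

#define COLLISION 0x1

struct attr { unsigned long key; unsigned flags; };

extern void *lookup(unsigned long key, struct attr *a);	/* may set flags */
extern void drop(void *obj);
extern unsigned long new_key(void);

void *get_object(struct attr *a)
{
	void *obj;

retry:
	obj = lookup(a->key, a);
	if (obj && (a->flags & COLLISION)) {
		drop(obj);		/* unusable hit: let it go */
		a->key = new_key();	/* fresh, locally unique key */
		a->flags &= ~COLLISION;
		goto retry;		/* bounded: cannot collide twice */
	}
	return obj;
}
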
@@ -779,7 +808,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
779 return ERR_PTR(-ENOMEM); 808 return ERR_PTR(-ENOMEM);
780 809
781 if (rc && cifs_sb->tcon->ipc) { 810 if (rc && cifs_sb->tcon->ipc) {
782 cFYI(1, ("ipc connection - fake read inode")); 811 cFYI(1, "ipc connection - fake read inode");
783 inode->i_mode |= S_IFDIR; 812 inode->i_mode |= S_IFDIR;
784 inode->i_nlink = 2; 813 inode->i_nlink = 2;
785 inode->i_op = &cifs_ipc_inode_ops; 814 inode->i_op = &cifs_ipc_inode_ops;
@@ -841,7 +870,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
841 * server times. 870 * server times.
842 */ 871 */
843 if (set_time && (attrs->ia_valid & ATTR_CTIME)) { 872 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
844 cFYI(1, ("CIFS - CTIME changed")); 873 cFYI(1, "CIFS - CTIME changed");
845 info_buf.ChangeTime = 874 info_buf.ChangeTime =
846 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); 875 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
847 } else 876 } else
@@ -876,8 +905,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
876 goto out; 905 goto out;
877 } 906 }
878 907
879 cFYI(1, ("calling SetFileInfo since SetPathInfo for " 908 cFYI(1, "calling SetFileInfo since SetPathInfo for "
880 "times not supported by this server")); 909 "times not supported by this server");
881 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 910 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
882 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, 911 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
883 CREATE_NOT_DIR, &netfid, &oplock, 912 CREATE_NOT_DIR, &netfid, &oplock,
@@ -1035,7 +1064,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1035 struct iattr *attrs = NULL; 1064 struct iattr *attrs = NULL;
1036 __u32 dosattr = 0, origattr = 0; 1065 __u32 dosattr = 0, origattr = 0;
1037 1066
1038 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); 1067 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1039 1068
1040 xid = GetXid(); 1069 xid = GetXid();
1041 1070
@@ -1054,7 +1083,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1054 rc = CIFSPOSIXDelFile(xid, tcon, full_path, 1083 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
1055 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 1084 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
1056 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1085 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1057 cFYI(1, ("posix del rc %d", rc)); 1086 cFYI(1, "posix del rc %d", rc);
1058 if ((rc == 0) || (rc == -ENOENT)) 1087 if ((rc == 0) || (rc == -ENOENT))
1059 goto psx_del_no_retry; 1088 goto psx_del_no_retry;
1060 } 1089 }
@@ -1128,7 +1157,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1128 struct inode *newinode = NULL; 1157 struct inode *newinode = NULL;
1129 struct cifs_fattr fattr; 1158 struct cifs_fattr fattr;
1130 1159
1131 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1160 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1132 1161
1133 xid = GetXid(); 1162 xid = GetXid();
1134 1163
@@ -1163,7 +1192,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1163 kfree(pInfo); 1192 kfree(pInfo);
1164 goto mkdir_retry_old; 1193 goto mkdir_retry_old;
1165 } else if (rc) { 1194 } else if (rc) {
1166 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1195 cFYI(1, "posix mkdir returned 0x%x", rc);
1167 d_drop(direntry); 1196 d_drop(direntry);
1168 } else { 1197 } else {
1169 if (pInfo->Type == cpu_to_le32(-1)) { 1198 if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1180,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1180 direntry->d_op = &cifs_dentry_ops; 1209 direntry->d_op = &cifs_dentry_ops;
1181 1210
1182 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1211 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1212 cifs_fill_uniqueid(inode->i_sb, &fattr);
1183 newinode = cifs_iget(inode->i_sb, &fattr); 1213 newinode = cifs_iget(inode->i_sb, &fattr);
1184 if (!newinode) { 1214 if (!newinode) {
1185 kfree(pInfo); 1215 kfree(pInfo);
@@ -1189,12 +1219,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1189 d_instantiate(direntry, newinode); 1219 d_instantiate(direntry, newinode);
1190 1220
1191#ifdef CONFIG_CIFS_DEBUG2 1221#ifdef CONFIG_CIFS_DEBUG2
1192 cFYI(1, ("instantiated dentry %p %s to inode %p", 1222 cFYI(1, "instantiated dentry %p %s to inode %p",
1193 direntry, direntry->d_name.name, newinode)); 1223 direntry, direntry->d_name.name, newinode);
1194 1224
1195 if (newinode->i_nlink != 2) 1225 if (newinode->i_nlink != 2)
1196 cFYI(1, ("unexpected number of links %d", 1226 cFYI(1, "unexpected number of links %d",
1197 newinode->i_nlink)); 1227 newinode->i_nlink);
1198#endif 1228#endif
1199 } 1229 }
1200 kfree(pInfo); 1230 kfree(pInfo);
@@ -1205,7 +1235,7 @@ mkdir_retry_old:
1205 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, 1235 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
1206 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1236 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1207 if (rc) { 1237 if (rc) {
1208 cFYI(1, ("cifs_mkdir returned 0x%x", rc)); 1238 cFYI(1, "cifs_mkdir returned 0x%x", rc);
1209 d_drop(direntry); 1239 d_drop(direntry);
1210 } else { 1240 } else {
1211mkdir_get_info: 1241mkdir_get_info:
@@ -1308,7 +1338,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1308 char *full_path = NULL; 1338 char *full_path = NULL;
1309 struct cifsInodeInfo *cifsInode; 1339 struct cifsInodeInfo *cifsInode;
1310 1340
1311 cFYI(1, ("cifs_rmdir, inode = 0x%p", inode)); 1341 cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
1312 1342
1313 xid = GetXid(); 1343 xid = GetXid();
1314 1344
@@ -1510,6 +1540,11 @@ cifs_inode_needs_reval(struct inode *inode)
1510 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1540 if (time_after_eq(jiffies, cifs_i->time + HZ))
1511 return true; 1541 return true;
1512 1542
1543 /* hardlinked files w/ noserverino get "special" treatment */
1544 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1545 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1546 return true;
1547
1513 return false; 1548 return false;
1514} 1549}
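
The check added above exists because, without server inode numbers, two paths to the same multiply-linked file get distinct local inodes whose cached attributes can drift apart, so any regular file with i_nlink != 1 on such a mount is always revalidated. A standalone rendering of that predicate with stand-in types and flag values (not the cifs definitions):

#include <stdbool.h>

#define MNT_SERVER_INUM 0x1	/* stand-in for CIFS_MOUNT_SERVER_INUM */
#define MODE_REG        0x8	/* stand-in for S_ISREG() */

struct inode_info {
	unsigned mnt_flags;	/* per-mount flags */
	unsigned mode;		/* file type bits */
	unsigned nlink;		/* hard link count */
	bool     timed_out;	/* attribute cache older than 1s */
};

bool needs_reval(const struct inode_info *i)
{
	if (i->timed_out)
		return true;

	/* hardlinked regular files w/o server inode numbers always
	 * revalidate: two local inodes may alias one remote file */
	if (!(i->mnt_flags & MNT_SERVER_INUM) &&
	    (i->mode & MODE_REG) && i->nlink != 1)
		return true;

	return false;
}
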
1515 1550
@@ -1576,9 +1611,9 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1576 goto check_inval; 1611 goto check_inval;
1577 } 1612 }
1578 1613
1579 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1614 cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1580 "jiffies %ld", full_path, inode, inode->i_count.counter, 1615 "jiffies %ld", full_path, inode, inode->i_count.counter,
1581 dentry, dentry->d_time, jiffies)); 1616 dentry, dentry->d_time, jiffies);
1582 1617
1583 if (CIFS_SB(sb)->tcon->unix_ext) 1618 if (CIFS_SB(sb)->tcon->unix_ext)
1584 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1619 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
@@ -1672,12 +1707,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1672 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1707 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1673 npid, false); 1708 npid, false);
1674 cifsFileInfo_put(open_file); 1709 cifsFileInfo_put(open_file);
1675 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1710 cFYI(1, "SetFSize for attrs rc = %d", rc);
1676 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1711 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1677 unsigned int bytes_written; 1712 unsigned int bytes_written;
1678 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1713 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1679 &bytes_written, NULL, NULL, 1); 1714 &bytes_written, NULL, NULL, 1);
1680 cFYI(1, ("Wrt seteof rc %d", rc)); 1715 cFYI(1, "Wrt seteof rc %d", rc);
1681 } 1716 }
1682 } else 1717 } else
1683 rc = -EINVAL; 1718 rc = -EINVAL;
@@ -1691,7 +1726,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1691 false, cifs_sb->local_nls, 1726 false, cifs_sb->local_nls,
1692 cifs_sb->mnt_cifs_flags & 1727 cifs_sb->mnt_cifs_flags &
1693 CIFS_MOUNT_MAP_SPECIAL_CHR); 1728 CIFS_MOUNT_MAP_SPECIAL_CHR);
1694 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1729 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1695 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1696 __u16 netfid; 1731 __u16 netfid;
1697 int oplock = 0; 1732 int oplock = 0;
@@ -1708,7 +1743,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1708 attrs->ia_size, 1743 attrs->ia_size,
1709 &bytes_written, NULL, 1744 &bytes_written, NULL,
1710 NULL, 1); 1745 NULL, 1);
1711 cFYI(1, ("wrt seteof rc %d", rc)); 1746 cFYI(1, "wrt seteof rc %d", rc);
1712 CIFSSMBClose(xid, pTcon, netfid); 1747 CIFSSMBClose(xid, pTcon, netfid);
1713 } 1748 }
1714 } 1749 }
@@ -1736,8 +1771,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1736 struct cifs_unix_set_info_args *args = NULL; 1771 struct cifs_unix_set_info_args *args = NULL;
1737 struct cifsFileInfo *open_file; 1772 struct cifsFileInfo *open_file;
1738 1773
1739 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1774 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1740 direntry->d_name.name, attrs->ia_valid)); 1775 direntry->d_name.name, attrs->ia_valid);
1741 1776
1742 xid = GetXid(); 1777 xid = GetXid();
1743 1778
@@ -1867,8 +1902,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1867 1902
1868 xid = GetXid(); 1903 xid = GetXid();
1869 1904
1870 cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", 1905 cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
1871 direntry->d_name.name, attrs->ia_valid)); 1906 direntry->d_name.name, attrs->ia_valid);
1872 1907
1873 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1908 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
1874 /* check if we have permission to change attrs */ 1909 /* check if we have permission to change attrs */
@@ -1925,7 +1960,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1925 attrs->ia_valid &= ~ATTR_MODE; 1960 attrs->ia_valid &= ~ATTR_MODE;
1926 1961
1927 if (attrs->ia_valid & ATTR_MODE) { 1962 if (attrs->ia_valid & ATTR_MODE) {
1928 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1963 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1929 mode = attrs->ia_mode; 1964 mode = attrs->ia_mode;
1930 } 1965 }
1931 1966
@@ -2011,7 +2046,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2011#if 0 2046#if 0
2012void cifs_delete_inode(struct inode *inode) 2047void cifs_delete_inode(struct inode *inode)
2013{ 2048{
2014 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2049 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
2015 /* may have to add back in if and when safe distributed caching of 2050 /* may have to add back in if and when safe distributed caching of
2016 directories added e.g. via FindNotify */ 2051 directories added e.g. via FindNotify */
2017} 2052}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..505926f1ee6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
47 47
48 xid = GetXid(); 48 xid = GetXid();
49 49
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 50 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 51
52 cifs_sb = CIFS_SB(inode->i_sb); 52 cifs_sb = CIFS_SB(inode->i_sb);
53 53
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 64
65 switch (command) { 65 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 66 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 67 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 68 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 69 rc = 0;
70 else { 70 else {
71 rc = -EACCES; 71 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 72 cFYI(1, "uids do not match");
73 } 73 }
74 break; 74 break;
75#ifdef CONFIG_CIFS_POSIX 75#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 98 extAttrBits, &ExtAttrMask);*/
99 } 99 }
100 cFYI(1, ("set flags not implemented yet")); 100 cFYI(1, "set flags not implemented yet");
101 break; 101 break;
102#endif /* CONFIG_CIFS_POSIX */ 102#endif /* CONFIG_CIFS_POSIX */
103 default: 103 default:
104 cFYI(1, ("unsupported ioctl")); 104 cFYI(1, "unsupported ioctl");
105 break; 105 break;
106 } 106 }
107 107
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
@@ -138,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
138 if (!full_path) 139 if (!full_path)
139 goto out; 140 goto out;
140 141
141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
142 143
143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
144 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -177,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
177 return rc; 178 return rc;
178 } 179 }
179 180
180 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
181 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
182 183
183 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
184 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -197,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
197 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
198 199
199 if (rc != 0) { 200 if (rc != 0) {
200 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
201 rc)); 202 rc);
202 } else { 203 } else {
203 if (pTcon->nocase) 204 if (pTcon->nocase)
204 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..1394aa37f26c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc; 501 int rc;
503 502
504 cFYI(1, ("Checking for oplock break or dnotify response")); 503 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 504 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 505 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 506 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 512
514 pnotify = (struct file_notify_information *) 513 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 514 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 515 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 516 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 517 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 518 sizeof(struct smb_hdr)+60); */
520 return true; 519 return true;
521 } 520 }
522 if (pSMBr->hdr.Status.CifsError) { 521 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 522 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 523 pSMBr->hdr.Status.CifsError);
525 return true; 524 return true;
526 } 525 }
527 return false; 526 return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 534 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 535 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 536 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 537 cFYI(1, "invalid handle on oplock break");
539 return true; 538 return true;
540 } else if (ERRbadfid == 539 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 540 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 546 if (pSMB->hdr.WordCount != 8)
548 return false; 547 return false;
549 548
550 cFYI(1, ("oplock type 0x%d level 0x%d", 549 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 550 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 551 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 552 return false;
554 553
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 578 return true;
580 } 579 }
581 580
582 cFYI(1, ("file id match, oplock break")); 581 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 582 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 583 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 584 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 585 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 586 rc = slow_work_enqueue(&netfile->oplock_break);
588 if (rc) { 587 if (rc) {
589 cERROR(1, ("failed to enqueue oplock " 588 cERROR(1, "failed to enqueue oplock "
590 "break: %d\n", rc)); 589 "break: %d\n", rc);
591 } else { 590 } else {
592 netfile->oplock_break_cancelled = false; 591 netfile->oplock_break_cancelled = false;
593 } 592 }
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
597 } 596 }
598 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 599 cFYI(1, "No matching file for oplock break");
601 return true; 600 return true;
602 } 601 }
603 } 602 }
604 read_unlock(&cifs_tcp_ses_lock); 603 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 604 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 605 return true;
607} 606}
608 607
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 720{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 721 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 722 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 723 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 724 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 725 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 726 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 727 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 728 cifs_sb->tcon->treeName);
730 } 729 }
731} 730}
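
cifs_autodisable_serverino() above clears CIFS_MOUNT_SERVER_INUM the first time a misbehaving server is detected, and the flag test doubles as a once-only guard so the cERROR() warning fires a single time per mount. A tiny runnable sketch of that clear-flag-and-warn-once idiom:

/* The flag test is the guard: repeated calls stay silent. */
#include <stdio.h>

#define F_SERVER_INUM 0x1

static void autodisable(unsigned *flags, const char *share)
{
	if (*flags & F_SERVER_INUM) {
		*flags &= ~F_SERVER_INUM;
		fprintf(stderr,
			"disabling server inode numbers on %s; hardlinks "
			"will not be recognized on this mount\n", share);
	}
}

int main(void)
{
	unsigned flags = F_SERVER_INUM;

	autodisable(&flags, "//srv/share");	/* warns once */
	autodisable(&flags, "//srv/share");	/* silent thereafter */
	return 0;
}
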
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..d35d52889cb5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
149 else if (address_family == AF_INET6) 149 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
151 151
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 152 cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
153 if (ret > 0) 153 if (ret > 0)
154 ret = 1; 154 ret = 1;
155 return ret; 155 return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 870 }
871 /* else ERRHRD class errors or junk - return EIO */ 871 /* else ERRHRD class errors or junk - return EIO */
872 872
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 873 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 874 smberrcode, rc);
875 875
876 /* generic corrective action e.g. reconnect SMB session on 876 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 877 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
942 942
943 cFYI(1, ("date %d time %d", date, time)); 943 cFYI(1, "date %d time %d", date, time);
944 944
945 sec = 2 * st->TwoSeconds; 945 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 946 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 947 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 948 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 949 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 950 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 951 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 952 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 953 days = sd->Day;
954 month = sd->Month; 954 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 955 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 956 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 957 if (month > 12)
958 month = 12; 958 month = 12;
959 } 959 }
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 979
980 ts.tv_sec = sec + offset; 980 ts.tv_sec = sec + offset;
981 981
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 983
984 ts.tv_nsec = 0; 984 ts.tv_nsec = 0;
985 return ts; 985 return ts;
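cnvrtDosUnixTm() above unpacks the legacy DOS-encoded date and time words through the SMB_TIME/SMB_DATE bitfields, which is why the code doubles TwoSeconds (the field stores 2-second units) and range-checks minutes, hours, day and month coming from the server. A standalone sketch of the same packing, with hypothetical raw values:

#include <stdio.h>

/* DOS time: bits 0-4 seconds/2, 5-10 minutes, 11-15 hours.
 * DOS date: bits 0-4 day, 5-8 month, 9-15 years since 1980. */
static void decode_dos_datetime(unsigned short date, unsigned short time)
{
	unsigned sec   = (time & 0x1f) * 2;
	unsigned min   = (time >> 5) & 0x3f;
	unsigned hour  = (time >> 11) & 0x1f;
	unsigned day   = date & 0x1f;
	unsigned month = (date >> 5) & 0x0f;
	unsigned year  = ((date >> 9) & 0x7f) + 1980;

	printf("%04u-%02u-%02u %02u:%02u:%02u\n",
	       year, month, day, hour, min, sec);
}

int main(void)
{
	decode_dos_datetime(0x5021, 0x48a1);	/* 2020-01-01 09:05:02 */
	return 0;
}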
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..daf1753af674 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -46,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
46 if (file) { 47 if (file) {
47 cf = file->private_data; 48 cf = file->private_data;
48 if (cf == NULL) { 49 if (cf == NULL) {
49 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
50 return; 51 return;
51 } 52 }
52 if (cf->invalidHandle) 53 if (cf->invalidHandle)
53 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
54 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
55 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
56 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
57 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
58 } 59 }
59} 60}
60#else 61#else
@@ -75,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
75 struct inode *inode; 76 struct inode *inode;
76 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
79 80
80 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
81 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -213,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
213 fid, 214 fid,
214 cifs_sb->local_nls); 215 cifs_sb->local_nls);
215 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
216 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
217 } 218 }
218 } 219 }
219} 220}
@@ -251,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
251 if (full_path == NULL) 252 if (full_path == NULL)
252 return -ENOMEM; 253 return -ENOMEM;
253 254
254 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
255 256
256ffirst_retry: 257ffirst_retry:
257 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -296,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
296 if (ustr[len] == 0) 297 if (ustr[len] == 0)
297 return len << 1; 298 return len << 1;
298 } 299 }
299 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
300 return len << 1; 301 return len << 1;
301} 302}
302 303
@@ -313,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
313 pfData->FileNameLength; 314 pfData->FileNameLength;
314 } else 315 } else
315 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
316 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
317 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
318 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
319 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
320 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
321 new_entry, end_of_smb, old_entry));
322 return NULL; 322 return NULL;
323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
325 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
327 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
328 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
329 return NULL; 329 return NULL;
330 } else 330 } else
331 return new_entry; 331 return new_entry;
@@ -379,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
379 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
380 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
381 } else { 381 } else {
382 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
383 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
384 } 384 }
385 385
386 if (filename) { 386 if (filename) {
@@ -480,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
480 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
482 } else { 482 } else {
483 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
484 return -EINVAL; 484 return -EINVAL;
485 } 485 }
486 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -524,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
524 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
525 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 526 /* close and restart search */
527 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
529 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -534,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
534 } else 534 } else
535 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
539 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
540 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -545,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
545 } 545 }
546 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
547 if (rc) { 547 if (rc) {
548 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
549 rc)); 549 rc);
550 return rc; 550 return rc;
551 } 551 }
552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -554,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
554 554
555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
557 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
559 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -574,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
575 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
576 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
577 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
578 578
579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { 579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
580 /* go entry by entry figuring out which is first */ 580 /* go entry by entry figuring out which is first */
@@ -583,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
583 } 583 }
584 if ((current_entry == NULL) && (i < pos_in_buf)) { 584 if ((current_entry == NULL) && (i < pos_in_buf)) {
585 /* BB fixme - check if we should flag this error */ 585 /* BB fixme - check if we should flag this error */
586 cERROR(1, ("reached end of buf searching for pos in buf" 586 cERROR(1, "reached end of buf searching for pos in buf"
587 " %d index to find %lld rc %d", 587 " %d index to find %lld rc %d",
588 pos_in_buf, index_to_find, rc)); 588 pos_in_buf, index_to_find, rc);
589 } 589 }
590 rc = 0; 590 rc = 0;
591 *ppCurrentEntry = current_entry; 591 *ppCurrentEntry = current_entry;
592 } else { 592 } else {
593 cFYI(1, ("index not in buffer - could not findnext into it")); 593 cFYI(1, "index not in buffer - could not findnext into it");
594 return 0; 594 return 0;
595 } 595 }
596 596
597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { 597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
598 cFYI(1, ("can not return entries pos_in_buf beyond last")); 598 cFYI(1, "can not return entries pos_in_buf beyond last");
599 *num_to_ret = 0; 599 *num_to_ret = 0;
600 } else 600 } else
601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; 601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -655,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
655 /* one byte length, no name conversion */ 655 /* one byte length, no name conversion */
656 len = (unsigned int)pFindData->FileNameLength; 656 len = (unsigned int)pFindData->FileNameLength;
657 } else { 657 } else {
658 cFYI(1, ("Unknown findfirst level %d", level)); 658 cFYI(1, "Unknown findfirst level %d", level);
659 return -EINVAL; 659 return -EINVAL;
660 } 660 }
661 661
662 if (len > max_len) { 662 if (len > max_len) {
663 cERROR(1, ("bad search response length %d past smb end", len)); 663 cERROR(1, "bad search response length %d past smb end", len);
664 return -EINVAL; 664 return -EINVAL;
665 } 665 }
666 666
@@ -753,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
753 * case already. Why should we be clobbering other errors from it? 753 * case already. Why should we be clobbering other errors from it?
754 */ 754 */
755 if (rc) { 755 if (rc) {
756 cFYI(1, ("filldir rc = %d", rc)); 756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW; 757 rc = -EOVERFLOW;
758 } 758 }
759 dput(tmp_dentry); 759 dput(tmp_dentry);
@@ -785,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
785 case 0: 785 case 0:
786 if (filldir(direntry, ".", 1, file->f_pos, 786 if (filldir(direntry, ".", 1, file->f_pos,
787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { 787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
788 cERROR(1, ("Filldir for current dir failed")); 788 cERROR(1, "Filldir for current dir failed");
789 rc = -ENOMEM; 789 rc = -ENOMEM;
790 break; 790 break;
791 } 791 }
@@ -793,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
793 case 1: 793 case 1:
794 if (filldir(direntry, "..", 2, file->f_pos, 794 if (filldir(direntry, "..", 2, file->f_pos,
795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { 795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
796 cERROR(1, ("Filldir for parent dir failed")); 796 cERROR(1, "Filldir for parent dir failed");
797 rc = -ENOMEM; 797 rc = -ENOMEM;
798 break; 798 break;
799 } 799 }
@@ -806,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
806 806
807 if (file->private_data == NULL) { 807 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file); 808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, ("initiate cifs search rc %d", rc)); 809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) { 810 if (rc) {
811 FreeXid(xid); 811 FreeXid(xid);
812 return rc; 812 return rc;
@@ -820,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
820 cifsFile = file->private_data; 820 cifsFile = file->private_data;
821 if (cifsFile->srch_inf.endOfSearch) { 821 if (cifsFile->srch_inf.endOfSearch) {
822 if (cifsFile->srch_inf.emptyDir) { 822 if (cifsFile->srch_inf.emptyDir) {
823 cFYI(1, ("End of search, empty dir")); 823 cFYI(1, "End of search, empty dir");
824 rc = 0; 824 rc = 0;
825 break; 825 break;
826 } 826 }
@@ -832,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
832 rc = find_cifs_entry(xid, pTcon, file, 832 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 833 &current_entry, &num_to_fill);
834 if (rc) { 834 if (rc) {
835 cFYI(1, ("fce error %d", rc)); 835 cFYI(1, "fce error %d", rc);
836 goto rddir2_exit; 836 goto rddir2_exit;
837 } else if (current_entry != NULL) { 837 } else if (current_entry != NULL) {
838 cFYI(1, ("entry %lld found", file->f_pos)); 838 cFYI(1, "entry %lld found", file->f_pos);
839 } else { 839 } else {
840 cFYI(1, ("could not find entry")); 840 cFYI(1, "could not find entry");
841 goto rddir2_exit; 841 goto rddir2_exit;
842 } 842 }
843 cFYI(1, ("loop through %d times filling dir for net buf %p", 843 cFYI(1, "loop through %d times filling dir for net buf %p",
844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start)); 844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
845 max_len = smbCalcSize((struct smb_hdr *) 845 max_len = smbCalcSize((struct smb_hdr *)
846 cifsFile->srch_inf.ntwrk_buf_start); 846 cifsFile->srch_inf.ntwrk_buf_start);
847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -850,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
851 if (current_entry == NULL) { 851 if (current_entry == NULL) {
852 /* evaluate whether this case is an error */ 852 /* evaluate whether this case is an error */
853 cERROR(1, ("past SMB end, num to fill %d i %d", 853 cERROR(1, "past SMB end, num to fill %d i %d",
854 num_to_fill, i)); 854 num_to_fill, i);
855 break; 855 break;
856 } 856 }
857 /* if buggy server returns . and .. late do 857 /* if buggy server returns . and .. late do
@@ -866,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
866 file->f_pos++; 866 file->f_pos++;
867 if (file->f_pos == 867 if (file->f_pos ==
868 cifsFile->srch_inf.index_of_last_entry) { 868 cifsFile->srch_inf.index_of_last_entry) {
869 cFYI(1, ("last entry in buf at pos %lld %s", 869 cFYI(1, "last entry in buf at pos %lld %s",
870 file->f_pos, tmp_buf)); 870 file->f_pos, tmp_buf);
871 cifs_save_resume_key(current_entry, cifsFile); 871 cifs_save_resume_key(current_entry, cifsFile);
872 break; 872 break;
873 } else 873 } else
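Beyond the message cleanup, the readdir.c hunks show the bounds discipline nxt_dir_entry() applies while walking variable-length FIND entries: an entry whose NextEntryOffset lands at or past end_of_smb, or whose fixed header would extend past it, is rejected, since the buffer contents come straight from the server. A simplified user-space sketch of that check (the entry layout here is hypothetical; the real code distinguishes FIND_FILE_STANDARD_INFO from FILE_DIRECTORY_INFO headers):

#include <stdint.h>
#include <stdio.h>

struct dir_entry {
	uint32_t next_entry_offset;	/* 0 means "last entry" */
	uint32_t file_name_length;
	/* name bytes follow */
};

static struct dir_entry *next_entry(struct dir_entry *cur,
				    const char *end_of_buf)
{
	char *next = (char *)cur + cur->next_entry_offset;

	if (cur->next_entry_offset == 0)
		return NULL;		/* end of chain */
	if (next >= end_of_buf)
		return NULL;		/* starts past the SMB */
	if (next + sizeof(*cur) > end_of_buf)
		return NULL;		/* header would overrun */
	return (struct dir_entry *)next;
}

int main(void)
{
	uint32_t buf[16] = { 0 };
	struct dir_entry *e = (struct dir_entry *)buf;
	const char *end = (const char *)buf + sizeof(buf);

	e->next_entry_offset = 16;	/* in range */
	printf("ok:       %p\n", (void *)next_entry(e, end));
	e->next_entry_offset = 200;	/* past end of buffer */
	printf("rejected: %p\n", (void *)next_entry(e, end));
	return 0;
}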
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7707389bdf2c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,14 +29,17 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 36 unsigned char *p24);
36 37
37/* Checks if this is the first smb session to be reconnected after 38/*
38 the socket has been reestablished (so we know whether to use vc 0). 39 * Checks if this is the first smb session to be reconnected after
39 Called while holding the cifs_tcp_ses_lock, so do not block */ 40 * the socket has been reestablished (so we know whether to use vc 0).
41 * Called while holding the cifs_tcp_ses_lock, so do not block
42 */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses) 43static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{ 44{
42 struct list_head *tmp; 45 struct list_head *tmp;
@@ -283,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
283 int len; 286 int len;
284 char *data = *pbcc_area; 287 char *data = *pbcc_area;
285 288
286 cFYI(1, ("bleft %d", bleft)); 289 cFYI(1, "bleft %d", bleft);
287 290
288 /* 291 /*
289 * Windows servers do not always double null terminate their final 292 * Windows servers do not always double null terminate their final
@@ -300,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
300 303
301 kfree(ses->serverOS); 304 kfree(ses->serverOS);
302 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 305 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
303 cFYI(1, ("serverOS=%s", ses->serverOS)); 306 cFYI(1, "serverOS=%s", ses->serverOS);
304 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 307 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
305 data += len; 308 data += len;
306 bleft -= len; 309 bleft -= len;
@@ -309,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
309 312
310 kfree(ses->serverNOS); 313 kfree(ses->serverNOS);
311 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 314 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
312 cFYI(1, ("serverNOS=%s", ses->serverNOS)); 315 cFYI(1, "serverNOS=%s", ses->serverNOS);
313 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 316 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
314 data += len; 317 data += len;
315 bleft -= len; 318 bleft -= len;
@@ -318,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
318 321
319 kfree(ses->serverDomain); 322 kfree(ses->serverDomain);
320 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 323 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
321 cFYI(1, ("serverDomain=%s", ses->serverDomain)); 324 cFYI(1, "serverDomain=%s", ses->serverDomain);
322 325
323 return; 326 return;
324} 327}
@@ -331,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
331 int len; 334 int len;
332 char *bcc_ptr = *pbcc_area; 335 char *bcc_ptr = *pbcc_area;
333 336
334 cFYI(1, ("decode sessetup ascii. bleft %d", bleft)); 337 cFYI(1, "decode sessetup ascii. bleft %d", bleft);
335 338
336 len = strnlen(bcc_ptr, bleft); 339 len = strnlen(bcc_ptr, bleft);
337 if (len >= bleft) 340 if (len >= bleft)
@@ -343,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
343 if (ses->serverOS) 346 if (ses->serverOS)
344 strncpy(ses->serverOS, bcc_ptr, len); 347 strncpy(ses->serverOS, bcc_ptr, len);
345 if (strncmp(ses->serverOS, "OS/2", 4) == 0) { 348 if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
346 cFYI(1, ("OS/2 server")); 349 cFYI(1, "OS/2 server");
347 ses->flags |= CIFS_SES_OS2; 350 ses->flags |= CIFS_SES_OS2;
348 } 351 }
349 352
@@ -372,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
372 /* BB For newer servers which do not support Unicode, 375 /* BB For newer servers which do not support Unicode,
 373 but that do return the domain here, we could add parsing 376 but that do return the domain here, we could add parsing

374 for it later, but it is not very important */ 377 for it later, but it is not very important */
375 cFYI(1, ("ascii: bytes left %d", bleft)); 378 cFYI(1, "ascii: bytes left %d", bleft);
376 379
377 return rc; 380 return rc;
378} 381}
@@ -383,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
383 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
384 387
385 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
386 cERROR(1, ("challenge blob len %d too small", blob_len)); 389 cERROR(1, "challenge blob len %d too small", blob_len);
387 return -EINVAL; 390 return -EINVAL;
388 } 391 }
389 392
390 if (memcmp(pblob->Signature, "NTLMSSP", 8)) { 393 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
391 cERROR(1, ("blob signature incorrect %s", pblob->Signature)); 394 cERROR(1, "blob signature incorrect %s", pblob->Signature);
392 return -EINVAL; 395 return -EINVAL;
393 } 396 }
394 if (pblob->MessageType != NtLmChallenge) { 397 if (pblob->MessageType != NtLmChallenge) {
395 cERROR(1, ("Incorrect message type %d", pblob->MessageType)); 398 cERROR(1, "Incorrect message type %d", pblob->MessageType);
396 return -EINVAL; 399 return -EINVAL;
397 } 400 }
398 401
@@ -446,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 This function returns the length of the data in the blob */ 449 This function returns the length of the data in the blob */
447static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 450static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
448 struct cifsSesInfo *ses, 451 struct cifsSesInfo *ses,
449 const struct nls_table *nls_cp, int first) 452 const struct nls_table *nls_cp, bool first)
450{ 453{
451 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
452 __u32 flags; 455 __u32 flags;
@@ -545,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 548
546static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, 549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
547 struct cifsSesInfo *ses, 550 struct cifsSesInfo *ses,
548 const struct nls_table *nls, int first_time) 551 const struct nls_table *nls, bool first_time)
549{ 552{
550 int bloblen; 553 int bloblen;
551 554
@@ -558,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
558#endif 561#endif
559 562
560int 563int
561CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
562 const struct nls_table *nls_cp) 565 const struct nls_table *nls_cp)
563{ 566{
564 int rc = 0; 567 int rc = 0;
565 int wct; 568 int wct;
@@ -576,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
576 int bytes_remaining; 579 int bytes_remaining;
577 struct key *spnego_key = NULL; 580 struct key *spnego_key = NULL;
578 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time;
579 583
580 if (ses == NULL) 584 if (ses == NULL)
581 return -EINVAL; 585 return -EINVAL;
582 586
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
583 type = ses->server->secType; 591 type = ses->server->secType;
584 592
585 cFYI(1, ("sess setup type %d", type)); 593 cFYI(1, "sess setup type %d", type);
586ssetup_ntlmssp_authenticate: 594ssetup_ntlmssp_authenticate:
587 if (phase == NtLmChallenge) 595 if (phase == NtLmChallenge)
588 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -663,7 +671,7 @@ ssetup_ntlmssp_authenticate:
663 changed to do higher than lanman dialect and 671 changed to do higher than lanman dialect and
664 we reconnected would we ever calc signing_key? */ 672 we reconnected would we ever calc signing_key? */
665 673
666 cFYI(1, ("Negotiating LANMAN setting up strings")); 674 cFYI(1, "Negotiating LANMAN setting up strings");
667 /* Unicode not allowed for LANMAN dialects */ 675 /* Unicode not allowed for LANMAN dialects */
668 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
669#endif 677#endif
@@ -743,7 +751,7 @@ ssetup_ntlmssp_authenticate:
743 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); 751 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
744 } else 752 } else
745 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 753 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
746 } else if (type == Kerberos || type == MSKerberos) { 754 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 755#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 756 struct cifs_spnego_msg *msg;
749 spnego_key = cifs_get_spnego_key(ses); 757 spnego_key = cifs_get_spnego_key(ses);
@@ -757,17 +765,17 @@ ssetup_ntlmssp_authenticate:
757 /* check version field to make sure that cifs.upcall is 765 /* check version field to make sure that cifs.upcall is
758 sending us a response in an expected form */ 766 sending us a response in an expected form */
759 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { 767 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
760 cERROR(1, ("incorrect version of cifs.upcall (expected" 768 cERROR(1, "incorrect version of cifs.upcall (expected"
761 " %d but got %d)", 769 " %d but got %d)",
762 CIFS_SPNEGO_UPCALL_VERSION, msg->version)); 770 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
763 rc = -EKEYREJECTED; 771 rc = -EKEYREJECTED;
764 goto ssetup_exit; 772 goto ssetup_exit;
765 } 773 }
766 /* bail out if key is too long */ 774 /* bail out if key is too long */
767 if (msg->sesskey_len > 775 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 776 sizeof(ses->server->mac_signing_key.data.krb5)) {
769 cERROR(1, ("Kerberos signing key too long (%u bytes)", 777 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len)); 778 msg->sesskey_len);
771 rc = -EOVERFLOW; 779 rc = -EOVERFLOW;
772 goto ssetup_exit; 780 goto ssetup_exit;
773 } 781 }
@@ -795,7 +803,7 @@ ssetup_ntlmssp_authenticate:
795 /* BB: is this right? */ 803 /* BB: is this right? */
796 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 804 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
797#else /* ! CONFIG_CIFS_UPCALL */ 805#else /* ! CONFIG_CIFS_UPCALL */
798 cERROR(1, ("Kerberos negotiated but upcall support disabled!")); 806 cERROR(1, "Kerberos negotiated but upcall support disabled!");
799 rc = -ENOSYS; 807 rc = -ENOSYS;
800 goto ssetup_exit; 808 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 809#endif /* CONFIG_CIFS_UPCALL */
@@ -803,12 +811,12 @@ ssetup_ntlmssp_authenticate:
803#ifdef CONFIG_CIFS_EXPERIMENTAL 811#ifdef CONFIG_CIFS_EXPERIMENTAL
804 if (type == RawNTLMSSP) { 812 if (type == RawNTLMSSP) {
805 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 813 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
806 cERROR(1, ("NTLMSSP requires Unicode support")); 814 cERROR(1, "NTLMSSP requires Unicode support");
807 rc = -ENOSYS; 815 rc = -ENOSYS;
808 goto ssetup_exit; 816 goto ssetup_exit;
809 } 817 }
810 818
811 cFYI(1, ("ntlmssp session setup phase %d", phase)); 819 cFYI(1, "ntlmssp session setup phase %d", phase);
812 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 820 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
813 capabilities |= CAP_EXTENDED_SECURITY; 821 capabilities |= CAP_EXTENDED_SECURITY;
814 pSMB->req.Capabilities |= cpu_to_le32(capabilities); 822 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -826,7 +834,7 @@ ssetup_ntlmssp_authenticate:
826 on the response (challenge) */ 834 on the response (challenge) */
827 smb_buf->Uid = ses->Suid; 835 smb_buf->Uid = ses->Suid;
828 } else { 836 } else {
829 cERROR(1, ("invalid phase %d", phase)); 837 cERROR(1, "invalid phase %d", phase);
830 rc = -ENOSYS; 838 rc = -ENOSYS;
831 goto ssetup_exit; 839 goto ssetup_exit;
832 } 840 }
@@ -838,12 +846,12 @@ ssetup_ntlmssp_authenticate:
838 } 846 }
839 unicode_oslm_strings(&bcc_ptr, nls_cp); 847 unicode_oslm_strings(&bcc_ptr, nls_cp);
840 } else { 848 } else {
841 cERROR(1, ("secType %d not supported!", type)); 849 cERROR(1, "secType %d not supported!", type);
842 rc = -ENOSYS; 850 rc = -ENOSYS;
843 goto ssetup_exit; 851 goto ssetup_exit;
844 } 852 }
845#else 853#else
846 cERROR(1, ("secType %d not supported!", type)); 854 cERROR(1, "secType %d not supported!", type);
847 rc = -ENOSYS; 855 rc = -ENOSYS;
848 goto ssetup_exit; 856 goto ssetup_exit;
849#endif 857#endif
@@ -861,7 +869,7 @@ ssetup_ntlmssp_authenticate:
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 869 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 870 /* SMB request buf freed in SendReceive2 */
863 871
864 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc)); 872 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865 873
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 874 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 875 smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -869,7 +877,7 @@ ssetup_ntlmssp_authenticate:
869 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == 877 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
870 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { 878 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
871 if (phase != NtLmNegotiate) { 879 if (phase != NtLmNegotiate) {
872 cERROR(1, ("Unexpected more processing error")); 880 cERROR(1, "Unexpected more processing error");
873 goto ssetup_exit; 881 goto ssetup_exit;
874 } 882 }
875 /* NTLMSSP Negotiate sent now processing challenge (response) */ 883 /* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -881,14 +889,14 @@ ssetup_ntlmssp_authenticate:
881 889
882 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 890 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
883 rc = -EIO; 891 rc = -EIO;
884 cERROR(1, ("bad word count %d", smb_buf->WordCount)); 892 cERROR(1, "bad word count %d", smb_buf->WordCount);
885 goto ssetup_exit; 893 goto ssetup_exit;
886 } 894 }
887 action = le16_to_cpu(pSMB->resp.Action); 895 action = le16_to_cpu(pSMB->resp.Action);
888 if (action & GUEST_LOGIN) 896 if (action & GUEST_LOGIN)
889 cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */ 897 cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
890 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ 898 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
891 cFYI(1, ("UID = %d ", ses->Suid)); 899 cFYI(1, "UID = %d ", ses->Suid);
892 /* response can have either 3 or 4 word count - Samba sends 3 */ 900 /* response can have either 3 or 4 word count - Samba sends 3 */
893 /* and lanman response is 3 */ 901 /* and lanman response is 3 */
894 bytes_remaining = BCC(smb_buf); 902 bytes_remaining = BCC(smb_buf);
@@ -898,7 +906,7 @@ ssetup_ntlmssp_authenticate:
898 __u16 blob_len; 906 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 907 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 908 if (blob_len > bytes_remaining) {
901 cERROR(1, ("bad security blob length %d", blob_len)); 909 cERROR(1, "bad security blob length %d", blob_len);
902 rc = -EINVAL; 910 rc = -EINVAL;
903 goto ssetup_exit; 911 goto ssetup_exit;
904 } 912 }
@@ -932,7 +940,7 @@ ssetup_exit:
932 } 940 }
933 kfree(str_area); 941 kfree(str_area);
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 942 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); 943 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 944 cifs_small_buf_release(iov[0].iov_base);
937 } else if (resp_buf_type == CIFS_LARGE_BUFFER) 945 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
938 cifs_buf_release(iov[0].iov_base); 946 cifs_buf_release(iov[0].iov_base);
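The substantive change in the sess.c hunks is the dropped first_time parameter: CIFS_SessSetup() now computes it internally by calling is_first_ses_reconnect() with cifs_tcp_ses_lock held for reading, since that helper walks the server's session list and must see it stable (the same reason its comment says "do not block"). A user-space analogue of the pattern, with illustrative names and a pthread rwlock standing in for the kernel lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t ses_list_lock = PTHREAD_RWLOCK_INITIALIZER;

/* stand-in for walking the per-server session list */
static bool is_first_reconnect_locked(void)
{
	return true;
}

static int sess_setup(void)
{
	bool first_time;

	/* sample the list state once, under the lock, instead of
	 * trusting a value the caller computed earlier */
	pthread_rwlock_rdlock(&ses_list_lock);
	first_time = is_first_reconnect_locked();
	pthread_rwlock_unlock(&ses_list_lock);

	/* ... first_time then decides whether to request VC 0 ... */
	return first_time ? 0 : 1;
}

int main(void)
{
	printf("sess_setup: %d\n", sess_setup());
	return 0;
}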
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
24*/ 24*/
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/slab.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/string.h> 29#include <linux/string.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/gfp.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/net.h> 27#include <linux/net.h>
27#include <linux/delay.h> 28#include <linux/delay.h>
@@ -34,7 +35,6 @@
34#include "cifs_debug.h" 35#include "cifs_debug.h"
35 36
36extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
37extern struct kmem_cache *cifs_oplock_cachep;
38 38
39static struct mid_q_entry * 39static struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -42,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
42 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
43 43
44 if (server == NULL) { 44 if (server == NULL) {
45 cERROR(1, ("Null TCP session in AllocMidQEntry")); 45 cERROR(1, "Null TCP session in AllocMidQEntry");
46 return NULL; 46 return NULL;
47 } 47 }
48 48
@@ -54,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
54 temp->mid = smb_buffer->Mid; /* always LE */ 54 temp->mid = smb_buffer->Mid; /* always LE */
55 temp->pid = current->pid; 55 temp->pid = current->pid;
56 temp->command = smb_buffer->Command; 56 temp->command = smb_buffer->Command;
57 cFYI(1, ("For smb_command %d", temp->command)); 57 cFYI(1, "For smb_command %d", temp->command);
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 59 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 60 temp->when_alloc = jiffies;
@@ -139,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
139 total_len += iov[i].iov_len; 139 total_len += iov[i].iov_len;
140 140
141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); 141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
142 cFYI(1, ("Sending smb: total_len %d", total_len)); 142 cFYI(1, "Sending smb: total_len %d", total_len);
143 dump_smb(smb_buffer, len); 143 dump_smb(smb_buffer, len);
144 144
145 i = 0; 145 i = 0;
@@ -167,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
167 reconnect which may clear the network problem. 167 reconnect which may clear the network problem.
168 */ 168 */
169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) { 169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
170 cERROR(1, 170 cERROR(1, "sends on sock %p stuck for 15 seconds",
171 ("sends on sock %p stuck for 15 seconds", 171 ssocket);
172 ssocket));
173 rc = -EAGAIN; 172 rc = -EAGAIN;
174 break; 173 break;
175 } 174 }
@@ -183,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
183 total_len = 0; 182 total_len = 0;
184 break; 183 break;
185 } else if (rc > total_len) { 184 } else if (rc > total_len) {
186 cERROR(1, ("sent %d requested %d", rc, total_len)); 185 cERROR(1, "sent %d requested %d", rc, total_len);
187 break; 186 break;
188 } 187 }
189 if (rc == 0) { 188 if (rc == 0) {
190 /* should never happen, letting socket clear before 189 /* should never happen, letting socket clear before
191 retrying is our only obvious option here */ 190 retrying is our only obvious option here */
192 cERROR(1, ("tcp sent no data")); 191 cERROR(1, "tcp sent no data");
193 msleep(500); 192 msleep(500);
194 continue; 193 continue;
195 } 194 }
@@ -212,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
212 } 211 }
213 212
214 if ((total_len > 0) && (total_len != smb_buf_length + 4)) { 213 if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
215 cFYI(1, ("partial send (%d remaining), terminating session", 214 cFYI(1, "partial send (%d remaining), terminating session",
216 total_len)); 215 total_len);
217 /* If we have only sent part of an SMB then the next SMB 216 /* If we have only sent part of an SMB then the next SMB
218 could be taken as the remainder of this one. We need 217 could be taken as the remainder of this one. We need
219 to kill the socket so the server throws away the partial 218 to kill the socket so the server throws away the partial
@@ -222,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
222 } 221 }
223 222
224 if (rc < 0) { 223 if (rc < 0) {
225 cERROR(1, ("Error %d sending data on socket to server", rc)); 224 cERROR(1, "Error %d sending data on socket to server", rc);
226 } else 225 } else
227 rc = 0; 226 rc = 0;
228 227
@@ -295,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
295 } 294 }
296 295
297 if (ses->server->tcpStatus == CifsNeedReconnect) { 296 if (ses->server->tcpStatus == CifsNeedReconnect) {
298 cFYI(1, ("tcp session dead - return to caller to retry")); 297 cFYI(1, "tcp session dead - return to caller to retry");
299 return -EAGAIN; 298 return -EAGAIN;
300 } 299 }
301 300
@@ -347,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
347 lrt += time_to_wait; 346 lrt += time_to_wait;
348 if (time_after(jiffies, lrt)) { 347 if (time_after(jiffies, lrt)) {
349 /* No replies for time_to_wait. */ 348 /* No replies for time_to_wait. */
350 cERROR(1, ("server not responding")); 349 cERROR(1, "server not responding");
351 return -1; 350 return -1;
352 } 351 }
353 } else { 352 } else {
@@ -378,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
378 iov[0].iov_len = in_buf->smb_buf_length + 4; 377 iov[0].iov_len = in_buf->smb_buf_length + 4;
379 flags |= CIFS_NO_RESP; 378 flags |= CIFS_NO_RESP;
380 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); 379 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
381 cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc)); 380 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
382 381
383 return rc; 382 return rc;
384} 383}
@@ -401,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
401 400
402 if ((ses == NULL) || (ses->server == NULL)) { 401 if ((ses == NULL) || (ses->server == NULL)) {
403 cifs_small_buf_release(in_buf); 402 cifs_small_buf_release(in_buf);
404 cERROR(1, ("Null session")); 403 cERROR(1, "Null session");
405 return -EIO; 404 return -EIO;
406 } 405 }
407 406
@@ -470,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
470 else if (long_op == CIFS_BLOCKING_OP) 469 else if (long_op == CIFS_BLOCKING_OP)
471 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ 470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
472 else { 471 else {
473 cERROR(1, ("unknown timeout flag %d", long_op)); 472 cERROR(1, "unknown timeout flag %d", long_op);
474 rc = -EIO; 473 rc = -EIO;
475 goto out; 474 goto out;
476 } 475 }
@@ -489,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
489 spin_lock(&GlobalMid_Lock); 488 spin_lock(&GlobalMid_Lock);
490 489
491 if (midQ->resp_buf == NULL) { 490 if (midQ->resp_buf == NULL) {
492 cERROR(1, ("No response to cmd %d mid %d", 491 cERROR(1, "No response to cmd %d mid %d",
493 midQ->command, midQ->mid)); 492 midQ->command, midQ->mid);
494 if (midQ->midState == MID_REQUEST_SUBMITTED) { 493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
495 if (ses->server->tcpStatus == CifsExiting) 494 if (ses->server->tcpStatus == CifsExiting)
496 rc = -EHOSTDOWN; 495 rc = -EHOSTDOWN;
@@ -503,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
503 if (rc != -EHOSTDOWN) { 502 if (rc != -EHOSTDOWN) {
504 if (midQ->midState == MID_RETRY_NEEDED) { 503 if (midQ->midState == MID_RETRY_NEEDED) {
505 rc = -EAGAIN; 504 rc = -EAGAIN;
506 cFYI(1, ("marking request for retry")); 505 cFYI(1, "marking request for retry");
507 } else { 506 } else {
508 rc = -EIO; 507 rc = -EIO;
509 } 508 }
@@ -520,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
520 receive_len = midQ->resp_buf->smb_buf_length; 519 receive_len = midQ->resp_buf->smb_buf_length;
521 520
522 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
523 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 522 cERROR(1, "Frame too large received. Length: %d Xid: %d",
524 receive_len, xid)); 523 receive_len, xid);
525 rc = -EIO; 524 rc = -EIO;
526 goto out; 525 goto out;
527 } 526 }
@@ -547,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
547 &ses->server->mac_signing_key, 546 &ses->server->mac_signing_key,
548 midQ->sequence_number+1); 547 midQ->sequence_number+1);
549 if (rc) { 548 if (rc) {
550 cERROR(1, ("Unexpected SMB signature")); 549 cERROR(1, "Unexpected SMB signature");
551 /* BB FIXME add code to kill session */ 550 /* BB FIXME add code to kill session */
552 } 551 }
553 } 552 }
@@ -568,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
568 DeleteMidQEntry */ 567 DeleteMidQEntry */
569 } else { 568 } else {
570 rc = -EIO; 569 rc = -EIO;
571 cFYI(1, ("Bad MID state?")); 570 cFYI(1, "Bad MID state?");
572 } 571 }
573 572
574out: 573out:
@@ -590,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
590 struct mid_q_entry *midQ; 589 struct mid_q_entry *midQ;
591 590
592 if (ses == NULL) { 591 if (ses == NULL) {
593 cERROR(1, ("Null smb session")); 592 cERROR(1, "Null smb session");
594 return -EIO; 593 return -EIO;
595 } 594 }
596 if (ses->server == NULL) { 595 if (ses->server == NULL) {
597 cERROR(1, ("Null tcp session")); 596 cERROR(1, "Null tcp session");
598 return -EIO; 597 return -EIO;
599 } 598 }
600 599
@@ -606,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
606 use ses->maxReq */ 605 use ses->maxReq */
607 606
608 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 607 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
609 cERROR(1, ("Illegal length, greater than maximum frame, %d", 608 cERROR(1, "Illegal length, greater than maximum frame, %d",
610 in_buf->smb_buf_length)); 609 in_buf->smb_buf_length);
611 return -EIO; 610 return -EIO;
612 } 611 }
613 612
@@ -664,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
664 else if (long_op == CIFS_BLOCKING_OP) 663 else if (long_op == CIFS_BLOCKING_OP)
 665 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */ 664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
666 else { 665 else {
667 cERROR(1, ("unknown timeout flag %d", long_op)); 666 cERROR(1, "unknown timeout flag %d", long_op);
668 rc = -EIO; 667 rc = -EIO;
669 goto out; 668 goto out;
670 } 669 }
@@ -680,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
680 679
681 spin_lock(&GlobalMid_Lock); 680 spin_lock(&GlobalMid_Lock);
682 if (midQ->resp_buf == NULL) { 681 if (midQ->resp_buf == NULL) {
683 cERROR(1, ("No response for cmd %d mid %d", 682 cERROR(1, "No response for cmd %d mid %d",
684 midQ->command, midQ->mid)); 683 midQ->command, midQ->mid);
685 if (midQ->midState == MID_REQUEST_SUBMITTED) { 684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
686 if (ses->server->tcpStatus == CifsExiting) 685 if (ses->server->tcpStatus == CifsExiting)
687 rc = -EHOSTDOWN; 686 rc = -EHOSTDOWN;
@@ -694,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
694 if (rc != -EHOSTDOWN) { 693 if (rc != -EHOSTDOWN) {
695 if (midQ->midState == MID_RETRY_NEEDED) { 694 if (midQ->midState == MID_RETRY_NEEDED) {
696 rc = -EAGAIN; 695 rc = -EAGAIN;
697 cFYI(1, ("marking request for retry")); 696 cFYI(1, "marking request for retry");
698 } else { 697 } else {
699 rc = -EIO; 698 rc = -EIO;
700 } 699 }
@@ -711,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
711 receive_len = midQ->resp_buf->smb_buf_length; 710 receive_len = midQ->resp_buf->smb_buf_length;
712 711
713 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
714 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 713 cERROR(1, "Frame too large received. Length: %d Xid: %d",
715 receive_len, xid)); 714 receive_len, xid);
716 rc = -EIO; 715 rc = -EIO;
717 goto out; 716 goto out;
718 } 717 }
@@ -735,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
735 &ses->server->mac_signing_key, 734 &ses->server->mac_signing_key,
736 midQ->sequence_number+1); 735 midQ->sequence_number+1);
737 if (rc) { 736 if (rc) {
738 cERROR(1, ("Unexpected SMB signature")); 737 cERROR(1, "Unexpected SMB signature");
739 /* BB FIXME add code to kill session */ 738 /* BB FIXME add code to kill session */
740 } 739 }
741 } 740 }
@@ -752,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
752 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
753 } else { 752 } else {
754 rc = -EIO; 753 rc = -EIO;
755 cERROR(1, ("Bad MID state?")); 754 cERROR(1, "Bad MID state?");
756 } 755 }
757 756
758out: 757out:
@@ -823,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
823 struct cifsSesInfo *ses; 822 struct cifsSesInfo *ses;
824 823
825 if (tcon == NULL || tcon->ses == NULL) { 824 if (tcon == NULL || tcon->ses == NULL) {
826 cERROR(1, ("Null smb session")); 825 cERROR(1, "Null smb session");
827 return -EIO; 826 return -EIO;
828 } 827 }
829 ses = tcon->ses; 828 ses = tcon->ses;
830 829
831 if (ses->server == NULL) { 830 if (ses->server == NULL) {
832 cERROR(1, ("Null tcp session")); 831 cERROR(1, "Null tcp session");
833 return -EIO; 832 return -EIO;
834 } 833 }
835 834
@@ -841,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
841 use ses->maxReq */ 840 use ses->maxReq */
842 841
843 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 842 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
844 cERROR(1, ("Illegal length, greater than maximum frame, %d", 843 cERROR(1, "Illegal length, greater than maximum frame, %d",
845 in_buf->smb_buf_length)); 844 in_buf->smb_buf_length);
846 return -EIO; 845 return -EIO;
847 } 846 }
848 847
@@ -932,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
932 spin_unlock(&GlobalMid_Lock); 931 spin_unlock(&GlobalMid_Lock);
933 receive_len = midQ->resp_buf->smb_buf_length; 932 receive_len = midQ->resp_buf->smb_buf_length;
934 } else { 933 } else {
935 cERROR(1, ("No response for cmd %d mid %d", 934 cERROR(1, "No response for cmd %d mid %d",
936 midQ->command, midQ->mid)); 935 midQ->command, midQ->mid);
937 if (midQ->midState == MID_REQUEST_SUBMITTED) { 936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
938 if (ses->server->tcpStatus == CifsExiting) 937 if (ses->server->tcpStatus == CifsExiting)
939 rc = -EHOSTDOWN; 938 rc = -EHOSTDOWN;
@@ -946,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
946 if (rc != -EHOSTDOWN) { 945 if (rc != -EHOSTDOWN) {
947 if (midQ->midState == MID_RETRY_NEEDED) { 946 if (midQ->midState == MID_RETRY_NEEDED) {
948 rc = -EAGAIN; 947 rc = -EAGAIN;
949 cFYI(1, ("marking request for retry")); 948 cFYI(1, "marking request for retry");
950 } else { 949 } else {
951 rc = -EIO; 950 rc = -EIO;
952 } 951 }
@@ -957,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
957 } 956 }
958 957
959 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
960 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 959 cERROR(1, "Frame too large received. Length: %d Xid: %d",
961 receive_len, xid)); 960 receive_len, xid);
962 rc = -EIO; 961 rc = -EIO;
963 goto out; 962 goto out;
964 } 963 }
@@ -967,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
967 966
968 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) { 967 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
969 rc = -EIO; 968 rc = -EIO;
970 cERROR(1, ("Bad MID state?")); 969 cERROR(1, "Bad MID state?");
971 goto out; 970 goto out;
972 } 971 }
973 972
@@ -985,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
985 &ses->server->mac_signing_key, 984 &ses->server->mac_signing_key,
986 midQ->sequence_number+1); 985 midQ->sequence_number+1);
987 if (rc) { 986 if (rc) {
988 cERROR(1, ("Unexpected SMB signature")); 987 cERROR(1, "Unexpected SMB signature");
989 /* BB FIXME add code to kill session */ 988 /* BB FIXME add code to kill session */
990 } 989 }
991 } 990 }
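smb_sendv() above retries busy sends but bounds the retries: after enough consecutive failures ("sends on sock %p stuck for 15 seconds") it returns -EAGAIN so the caller can tear the session down and reconnect, and a zero-byte send ("tcp sent no data") just sleeps 500 ms before retrying. A user-space sketch of that policy (helper name and fixed sleep illustrative; the kernel backs off exponentially with msleep(1 << i)):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t send_fully(int fd, const char *buf, size_t len)
{
	size_t sent = 0;
	int busy = 0;

	while (sent < len) {
		ssize_t rc = write(fd, buf + sent, len - sent);

		if (rc < 0 && (errno == EAGAIN || errno == EINTR)) {
			if (++busy >= 14)
				return -EAGAIN;	/* caller reconnects */
			usleep(500 * 1000);	/* let the socket drain */
			continue;
		}
		if (rc < 0)
			return -errno;
		if (rc == 0) {
			/* "should never happen": wait for the socket
			 * to clear, as the kernel code does */
			usleep(500 * 1000);
			continue;
		}
		busy = 0;
		sent += (size_t)rc;
	}
	return (ssize_t)sent;
}

int main(void)
{
	static const char msg[] = "hello\n";

	return send_fully(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0;
}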
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
@@ -69,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
69 return rc; 70 return rc;
70 } 71 }
71 if (ea_name == NULL) { 72 if (ea_name == NULL) {
72 cFYI(1, ("Null xattr names not supported")); 73 cFYI(1, "Null xattr names not supported");
73 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) 74 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
74 && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { 75 && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
75 cFYI(1, 76 cFYI(1,
76 ("illegal xattr request %s (only user namespace supported)", 77 "illegal xattr request %s (only user namespace supported)",
77 ea_name)); 78 ea_name);
78 /* BB what if no namespace prefix? */ 79 /* BB what if no namespace prefix? */
79 /* Should we just pass them to server, except for 80 /* Should we just pass them to server, except for
80 system and perhaps security prefixes? */ 81 system and perhaps security prefixes? */
@@ -130,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
130 search server for EAs or streams to 131 search server for EAs or streams to
131 returns as xattrs */ 132 returns as xattrs */
132 if (value_size > MAX_EA_VALUE_SIZE) { 133 if (value_size > MAX_EA_VALUE_SIZE) {
133 cFYI(1, ("size of EA value too large")); 134 cFYI(1, "size of EA value too large");
134 kfree(full_path); 135 kfree(full_path);
135 FreeXid(xid); 136 FreeXid(xid);
136 return -EOPNOTSUPP; 137 return -EOPNOTSUPP;
137 } 138 }
138 139
139 if (ea_name == NULL) { 140 if (ea_name == NULL) {
140 cFYI(1, ("Null xattr names not supported")); 141 cFYI(1, "Null xattr names not supported");
141 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { 142 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
142 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 143 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
143 goto set_ea_exit; 144 goto set_ea_exit;
144 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) 145 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
145 cFYI(1, ("attempt to set cifs inode metadata")); 146 cFYI(1, "attempt to set cifs inode metadata");
146 147
147 ea_name += 5; /* skip past user. prefix */ 148 ea_name += 5; /* skip past user. prefix */
148 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 149 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -168,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
168 ACL_TYPE_ACCESS, cifs_sb->local_nls, 169 ACL_TYPE_ACCESS, cifs_sb->local_nls,
169 cifs_sb->mnt_cifs_flags & 170 cifs_sb->mnt_cifs_flags &
170 CIFS_MOUNT_MAP_SPECIAL_CHR); 171 CIFS_MOUNT_MAP_SPECIAL_CHR);
171 cFYI(1, ("set POSIX ACL rc %d", rc)); 172 cFYI(1, "set POSIX ACL rc %d", rc);
172#else 173#else
173 cFYI(1, ("set POSIX ACL not supported")); 174 cFYI(1, "set POSIX ACL not supported");
174#endif 175#endif
175 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 176 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
176 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 177 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -181,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
181 ACL_TYPE_DEFAULT, cifs_sb->local_nls, 182 ACL_TYPE_DEFAULT, cifs_sb->local_nls,
182 cifs_sb->mnt_cifs_flags & 183 cifs_sb->mnt_cifs_flags &
183 CIFS_MOUNT_MAP_SPECIAL_CHR); 184 CIFS_MOUNT_MAP_SPECIAL_CHR);
184 cFYI(1, ("set POSIX default ACL rc %d", rc)); 185 cFYI(1, "set POSIX default ACL rc %d", rc);
185#else 186#else
186 cFYI(1, ("set default POSIX ACL not supported")); 187 cFYI(1, "set default POSIX ACL not supported");
187#endif 188#endif
188 } else { 189 } else {
189 cFYI(1, ("illegal xattr request %s (only user namespace" 190 cFYI(1, "illegal xattr request %s (only user namespace"
190 " supported)", ea_name)); 191 " supported)", ea_name);
191 /* BB what if no namespace prefix? */ 192 /* BB what if no namespace prefix? */
192 /* Should we just pass them to server, except for 193 /* Should we just pass them to server, except for
193 system and perhaps security prefixes? */ 194 system and perhaps security prefixes? */
@@ -234,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
234 /* return dos attributes as pseudo xattr */ 235 /* return dos attributes as pseudo xattr */
235 /* return alt name if available as pseudo attr */ 236 /* return alt name if available as pseudo attr */
236 if (ea_name == NULL) { 237 if (ea_name == NULL) {
237 cFYI(1, ("Null xattr names not supported")); 238 cFYI(1, "Null xattr names not supported");
238 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { 239 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
239 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 240 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
240 goto get_ea_exit; 241 goto get_ea_exit;
241 242
242 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { 243 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
243 cFYI(1, ("attempt to query cifs inode metadata")); 244 cFYI(1, "attempt to query cifs inode metadata");
244 /* revalidate/getattr then populate from inode */ 245 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 246 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 247 ea_name += 5; /* skip past user. prefix */
@@ -286,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
286 } 287 }
287#endif /* EXPERIMENTAL */ 288#endif /* EXPERIMENTAL */
288#else 289#else
289 cFYI(1, ("query POSIX ACL not supported yet")); 290 cFYI(1, "query POSIX ACL not supported yet");
290#endif /* CONFIG_CIFS_POSIX */ 291#endif /* CONFIG_CIFS_POSIX */
291 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 292 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
292 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 293 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -298,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
298 cifs_sb->mnt_cifs_flags & 299 cifs_sb->mnt_cifs_flags &
299 CIFS_MOUNT_MAP_SPECIAL_CHR); 300 CIFS_MOUNT_MAP_SPECIAL_CHR);
300#else 301#else
301 cFYI(1, ("query POSIX default ACL not supported yet")); 302 cFYI(1, "query POSIX default ACL not supported yet");
302#endif 303#endif
303 } else if (strncmp(ea_name, 304 } else if (strncmp(ea_name,
304 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 305 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
305 cFYI(1, ("Trusted xattr namespace not supported yet")); 306 cFYI(1, "Trusted xattr namespace not supported yet");
306 } else if (strncmp(ea_name, 307 } else if (strncmp(ea_name,
307 CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { 308 CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
308 cFYI(1, ("Security xattr namespace not supported yet")); 309 cFYI(1, "Security xattr namespace not supported yet");
309 } else 310 } else
310 cFYI(1, 311 cFYI(1,
311 ("illegal xattr request %s (only user namespace supported)", 312 "illegal xattr request %s (only user namespace supported)",
312 ea_name)); 313 ea_name);
313 314
314 /* We could add an additional check for streams ie 315 /* We could add an additional check for streams ie
315 if proc/fs/cifs/streamstoxattr is set then 316 if proc/fs/cifs/streamstoxattr is set then
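The cFYI churn above is mechanical: the macro changed from taking one parenthesized printk-style argument to being variadic, so every call site drops a pair of parentheses. A sketch of the two shapes, assuming simplified definitions (the real ones in fs/cifs/cifs_debug.h carry extra prefixes and config guards):

/* Old shape: the caller supplies its own parenthesized argument list,
 * hence cFYI(1, ("size of EA value too large")). */
#define cFYI_OLD(set, arg)			\
do {						\
	if (set)				\
		printk arg;			\
} while (0)

/* New shape: a variadic macro, so the extra parentheses go away:
 * cFYI(1, "size of EA value too large"). */
#define cFYI_NEW(set, fmt, arg...)			\
do {							\
	if (set)					\
		printk(KERN_DEBUG fmt "\n", ##arg);	\
} while (0)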
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/slab.h>
15#include <linux/file.h> 16#include <linux/file.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21 22
22#include <linux/coda.h> 23#include <linux/coda.h>
@@ -201,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201 return 0; 202 return 0;
202} 203}
203 204
204int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
205{ 206{
206 struct file *host_file; 207 struct file *host_file;
207 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
208 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
209 int err = 0; 210 int err = 0;
210 211
@@ -216,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
216 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
217 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
218 219
219 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
220 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
221 lock_kernel(); 222 lock_kernel();
222 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
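The coda_fsync() change above tracks a tree-wide ->fsync() API cleanup: the dentry argument is gone, so implementations recover the inode from the struct file itself. A minimal sketch of the post-change convention; example_sync_inode() is a hypothetical metadata-flush helper:

static int example_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	int err;

	/* flush dirty data pages first */
	err = filemap_write_and_wait(inode->i_mapping);

	/* a datasync request may skip pure metadata updates */
	if (!err && !datasync)
		err = example_sync_inode(inode);	/* hypothetical */
	return err;
}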
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/vfs.h> 20#include <linux/vfs.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
166 return -EBUSY; 167 return -EBUSY;
167 } 168 }
168 169
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error)
172 goto bdi_err;
173
169 vc->vc_sb = sb; 174 vc->vc_sb = sb;
170 175
171 sb->s_fs_info = vc; 176 sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
174 sb->s_blocksize_bits = 12; 179 sb->s_blocksize_bits = 12;
175 sb->s_magic = CODA_SUPER_MAGIC; 180 sb->s_magic = CODA_SUPER_MAGIC;
176 sb->s_op = &coda_super_operations; 181 sb->s_op = &coda_super_operations;
182 sb->s_bdi = &vc->bdi;
177 183
178 /* get root fid from Venus: this needs the root inode */ 184 /* get root fid from Venus: this needs the root inode */
179 error = venus_rootfid(sb, &fid); 185 error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
199 return 0; 205 return 0;
200 206
201 error: 207 error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
202 if (root) 210 if (root)
203 iput(root); 211 iput(root);
204 if (vc) 212 if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
209 217
210static void coda_put_super(struct super_block *sb) 218static void coda_put_super(struct super_block *sb)
211{ 219{
220 bdi_destroy(&coda_vcp(sb)->bdi);
212 coda_vcp(sb)->vc_sb = NULL; 221 coda_vcp(sb)->vc_sb = NULL;
213 sb->s_fs_info = NULL; 222 sb->s_fs_info = NULL;
214 223
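Coda has no backing block device, so the hunks above give each mount a private backing_dev_info for writeback. A condensed sketch of the lifecycle, assuming vc is the per-mount venus_comm and example_setup_root() stands in for the root-fid and root-inode work; ->put_super must call bdi_destroy() as well, as the hunk above does:

static int example_fill_super(struct super_block *sb, struct venus_comm *vc)
{
	int error;

	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
	if (error)
		return error;		/* nothing registered yet */

	sb->s_fs_info = vc;
	sb->s_bdi = &vc->bdi;		/* route writeback to this bdi */

	error = example_setup_root(sb);	/* hypothetical */
	if (error)
		goto out_bdi;		/* undo the registration */
	return 0;

out_bdi:
	bdi_destroy(&vc->bdi);
	return error;
}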
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
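This is the standard recipe for retiring the locked ->ioctl hook, also applied to coda_psdev_ioctl() below: switch to .unlocked_ioctl, recover the inode from the file, and take the BKL explicitly so behaviour is unchanged while the VFS stops acquiring it. A generic sketch; example_do_ioctl() is hypothetical:

#include <linux/smp_lock.h>

static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct inode *inode = filp->f_dentry->d_inode;
	long error;

	lock_kernel();			/* was implicit with .ioctl */
	error = example_do_ioctl(inode, cmd, arg);	/* hypothetical */
	unlock_kernel();
	return error;
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_unlocked_ioctl,
};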
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
73 return mask; 73 return mask;
74} 74}
75 75
76static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 76static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
77 unsigned int cmd, unsigned long arg)
78{ 77{
79 unsigned int data; 78 unsigned int data;
80 79
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
344 .read = coda_psdev_read, 343 .read = coda_psdev_read,
345 .write = coda_psdev_write, 344 .write = coda_psdev_write,
346 .poll = coda_psdev_poll, 345 .poll = coda_psdev_poll,
347 .ioctl = coda_psdev_ioctl, 346 .unlocked_ioctl = coda_psdev_ioctl,
348 .open = coda_psdev_open, 347 .open = coda_psdev_open,
349 .release = coda_psdev_release, 348 .release = coda_psdev_release,
350}; 349};
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
26#include <linux/stat.h> 26#include <linux/stat.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
31#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 030602d453b7..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/eventpoll.h> 50#include <linux/eventpoll.h>
51#include <linux/fs_struct.h> 51#include <linux/fs_struct.h>
52#include <linux/slab.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -567,6 +568,79 @@ out:
567 return ret; 568 return ret;
568} 569}
569 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should return -EINVAL if an element length is negative or does not
608 * fit in an ssize_t; the total length must also fit in an ssize_t.
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), buf, len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
570static inline long 644static inline long
571copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
572{ 646{
@@ -599,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
599 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
600 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
601 if (!ret) 675 if (!ret)
602 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
603 return ret; 677 return ret;
604} 678}
605 679
@@ -1076,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1076{ 1150{
1077 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1078 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1079 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1080 ssize_t ret; 1154 ssize_t ret;
1081 int seg;
1082 io_fn_t fn; 1155 io_fn_t fn;
1083 iov_fn_t fnv; 1156 iov_fn_t fnv;
1084 1157
1085 /*
1086 * SuS says "The readv() function *may* fail if the iovcnt argument
1087 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1088 * traditionally returned zero for zero segments, so...
1089 */
1090 ret = 0;
1091 if (nr_segs == 0)
1092 goto out;
1093
1094 /*
1095 * First get the "struct iovec" from user memory and
1096 * verify all the pointers
1097 */
1098 ret = -EINVAL; 1158 ret = -EINVAL;
1099 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1100 goto out;
1101 if (!file->f_op) 1159 if (!file->f_op)
1102 goto out; 1160 goto out;
1103 if (nr_segs > UIO_FASTIOV) { 1161
1104 ret = -ENOMEM;
1105 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1106 if (!iov)
1107 goto out;
1108 }
1109 ret = -EFAULT; 1162 ret = -EFAULT;
1110 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1111 goto out; 1164 goto out;
1112 1165
1113 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1114 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1115 * We should -EINVAL if an element length is not >= 0 and fitting an
1116 * ssize_t. The total length is fitting an ssize_t
1117 *
1118 * Be careful here because iov_len is a size_t not an ssize_t
1119 */
1120 tot_len = 0;
1121 vector = iov;
1122 ret = -EINVAL;
1123 for (seg = 0 ; seg < nr_segs; seg++) {
1124 compat_ssize_t tmp = tot_len;
1125 compat_ssize_t len;
1126 compat_uptr_t buf;
1127
1128 if (__get_user(len, &uvector->iov_len) ||
1129 __get_user(buf, &uvector->iov_base)) {
1130 ret = -EFAULT;
1131 goto out;
1132 }
1133 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1134 goto out;
1135 tot_len += len;
1136 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1137 goto out;
1138 vector->iov_base = compat_ptr(buf);
1139 vector->iov_len = (compat_size_t) len;
1140 uvector++;
1141 vector++;
1142 }
1143 if (tot_len == 0) { 1168 if (tot_len == 0) {
1144 ret = 0; 1169 ret = 0;
1145 goto out; 1170 goto out;
@@ -1530,8 +1555,6 @@ int compat_do_execve(char * filename,
1530 if (retval < 0) 1555 if (retval < 0)
1531 goto out; 1556 goto out;
1532 1557
1533 current->stack_start = current->mm->start_stack;
1534
1535 /* execve succeeded */ 1558 /* execve succeeded */
1536 current->fs->in_exec = 0; 1559 current->fs->in_exec = 0;
1537 current->in_execve = 0; 1560 current->in_execve = 0;
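compat_rw_copy_check_uvector() is factored out above so other compat paths (the AIO path via do_io_submit(), per this diff) can share the iovec validation. A sketch of the calling convention; example_do_readv() is hypothetical. The caller supplies an on-stack fast array and frees the result only if the helper fell back to kmalloc():

static ssize_t example_compat_readv(struct file *file,
		const struct compat_iovec __user *uvector,
		unsigned long nr_segs, loff_t *pos)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;

	ret = compat_rw_copy_check_uvector(READ, uvector, nr_segs,
					   UIO_FASTIOV, iovstack, &iov);
	if (ret <= 0)		/* error, or zero total length */
		goto out;

	ret = example_do_readv(file, iov, nr_segs, pos);	/* hypothetical */
out:
	if (iov != iovstack)
		kfree(iov);	/* only the slow path allocated */
	return ret;
}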
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
23#include <linux/ioctl.h> 23#include <linux/ioctl.h>
24#include <linux/if.h> 24#include <linux/if.h>
25#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
26#include <linux/slab.h>
27#include <linux/raid/md_u.h> 26#include <linux/raid/md_u.h>
28#include <linux/kd.h> 27#include <linux/kd.h>
29#include <linux/route.h> 28#include <linux/route.h>
@@ -60,6 +59,7 @@
60#include <linux/i2c.h> 59#include <linux/i2c.h>
61#include <linux/i2c-dev.h> 60#include <linux/i2c-dev.h>
62#include <linux/atalk.h> 61#include <linux/atalk.h>
62#include <linux/gfp.h>
63 63
64#include <net/bluetooth/bluetooth.h> 64#include <net/bluetooth/bluetooth.h>
65#include <net/bluetooth/hci.h> 65#include <net/bluetooth/hci.h>
@@ -102,7 +102,6 @@
102#include <linux/nbd.h> 102#include <linux/nbd.h>
103#include <linux/random.h> 103#include <linux/random.h>
104#include <linux/filter.h> 104#include <linux/filter.h>
105#include <linux/pktcdvd.h>
106 105
107#include <linux/hiddev.h> 106#include <linux/hiddev.h>
108 107
@@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
1126COMPATIBLE_IOCTL(PPGETPHASE) 1125COMPATIBLE_IOCTL(PPGETPHASE)
1127COMPATIBLE_IOCTL(PPGETFLAGS) 1126COMPATIBLE_IOCTL(PPGETFLAGS)
1128COMPATIBLE_IOCTL(PPSETFLAGS) 1127COMPATIBLE_IOCTL(PPSETFLAGS)
1129/* pktcdvd */
1130COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
1131/* Big A */ 1128/* Big A */
1132/* sparc only */ 1129/* sparc only */
1133/* Big Q for sound/OSS */ 1130/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)
645 645
646 configfs_detach_group(sd->s_element); 646 configfs_detach_group(sd->s_element);
647 child->d_inode->i_flags |= S_DEAD; 647 child->d_inode->i_flags |= S_DEAD;
648 dont_mount(child);
648 649
649 mutex_unlock(&child->d_inode->i_mutex); 650 mutex_unlock(&child->d_inode->i_mutex);
650 651
@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
840 mutex_lock(&dentry->d_inode->i_mutex); 841 mutex_lock(&dentry->d_inode->i_mutex);
841 configfs_remove_dir(item); 842 configfs_remove_dir(item);
842 dentry->d_inode->i_flags |= S_DEAD; 843 dentry->d_inode->i_flags |= S_DEAD;
844 dont_mount(dentry);
843 mutex_unlock(&dentry->d_inode->i_mutex); 845 mutex_unlock(&dentry->d_inode->i_mutex);
844 d_delete(dentry); 846 d_delete(dentry);
845 } 847 }
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
882 if (ret) { 884 if (ret) {
883 configfs_detach_item(item); 885 configfs_detach_item(item);
884 dentry->d_inode->i_flags |= S_DEAD; 886 dentry->d_inode->i_flags |= S_DEAD;
887 dont_mount(dentry);
885 } 888 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd); 889 configfs_adjust_dir_dirent_depth_after_populate(sd);
887 mutex_unlock(&dentry->d_inode->i_mutex); 890 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1725 mutex_unlock(&configfs_symlink_mutex); 1728 mutex_unlock(&configfs_symlink_mutex);
1726 configfs_detach_group(&group->cg_item); 1729 configfs_detach_group(&group->cg_item);
1727 dentry->d_inode->i_flags |= S_DEAD; 1730 dentry->d_inode->i_flags |= S_DEAD;
1731 dont_mount(dentry);
1728 mutex_unlock(&dentry->d_inode->i_mutex); 1732 mutex_unlock(&dentry->d_inode->i_mutex);
1729 1733
1730 d_delete(dentry); 1734 d_delete(dentry);
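Each S_DEAD marking above gains a dont_mount() call so a racing mount cannot be grafted onto a dentry whose backing object is going away; d_delete() clears the flag again (see the fs/dcache.c hunk below). Per this series the helper amounts to (sketch):

static inline void dont_mount(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_CANT_MOUNT;
	spin_unlock(&dentry->d_lock);
}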
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..41645142b88b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/slab.h>
37 38
38#include <linux/configfs.h> 39#include <linux/configfs.h>
39#include "configfs_internal.h" 40#include "configfs_internal.h"
@@ -71,16 +72,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
71 if (!sd) 72 if (!sd)
72 return -EINVAL; 73 return -EINVAL;
73 74
74 sd_iattr = sd->s_iattr; 75 error = simple_setattr(dentry, iattr);
75
76 error = inode_change_ok(inode, iattr);
77 if (error)
78 return error;
79
80 error = inode_setattr(inode, iattr);
81 if (error) 76 if (error)
82 return error; 77 return error;
83 78
79 sd_iattr = sd->s_iattr;
84 if (!sd_iattr) { 80 if (!sd_iattr) {
85 /* setting attributes for the first time, allocate now */ 81 /* setting attributes for the first time, allocate now */
86 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 82 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
29#include <linux/mount.h> 29#include <linux/mount.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/slab.h>
32 33
33#include <linux/configfs.h> 34#include <linux/configfs.h>
34#include "configfs_internal.h" 35#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/slab.h>
30 31
31#include <linux/configfs.h> 32#include <linux/configfs.h>
32#include "configfs_internal.h" 33#include "configfs_internal.h"
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *n;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -590,14 +591,10 @@ restart:
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
592 count -= pruned; 593 count -= pruned;
593 /* 594 __put_super(sb);
594 * restart only when sb is no longer on the list and 595 /* more work left to do? */
595 * we have more work to do. 596 if (count <= 0)
596 */ 597 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 598 }
602 spin_unlock(&sb_lock); 599 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 600 spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1526 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1527 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1528 if (atomic_read(&dentry->d_count) == 1) {
1529 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1530 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1531 fsnotify_nameremove(dentry, isdir);
1534 return; 1532 return;
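The prune_dcache() rework drops the goto-restart dance: a superblock with an empty s_instances list is already on its way out and is skipped, and __put_super() can release the pinned reference in place because the _safe cursor preserves the list position. The skeleton of the new walk, with the per-sb pruning elided:

static void example_for_each_super(void (*work)(struct super_block *))
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;	/* already being torn down */
		sb->s_count++;		/* pin sb across the unlock */
		spin_unlock(&sb_lock);

		work(sb);		/* per-sb work, sb_lock dropped */

		spin_lock(&sb_lock);
		__put_super(sb);	/* safe: 'n' holds our position */
	}
	spin_unlock(&sb_lock);
}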
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
 369 * @value: a pointer to the variable that the file should read from and
 370 * write to.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/magic.h> 29#include <linux/magic.h>
30#include <linux/slab.h>
30 31
31static struct vfsmount *debugfs_mount; 32static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 33static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/tty.h> 20#include <linux/tty.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
@@ -383,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
383 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
384 } 385 }
385 386
386 simple_set_mnt(mnt, s);
387
388 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
389 388
390 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
391 if (error) 390 if (error)
392 goto out_dput; 391 goto out_undo_sget;
393 392
394 return 0; 393 simple_set_mnt(mnt, s);
395 394
396out_dput: 395 return 0;
397 dput(s->s_root); /* undo dget() in simple_set_mnt() */
398 396
399out_undo_sget: 397out_undo_sget:
400 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
 85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
 308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
 311 * This is meant to be called by any filesystem that uses its own dio_submit_t
 312 * so that the DIO-specific endio actions are dealt with after the filesystem
 313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
 642 * Btrfs cannot handle having logically non-contiguous requests
 643 * submitted. For example, if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
 646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
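The new submit_io hook lets a filesystem (btrfs is the intended user) intercept each direct-io bio before it reaches the block layer, and the logical_offset_in_bio bookkeeping guarantees such callers only ever see logically contiguous bios. A sketch of the wiring; example_get_block() is hypothetical, and an implementation that overrides bi_end_io must eventually call the exported dio_end_io():

static void example_submit_io(int rw, struct bio *bio, struct inode *inode,
			      loff_t file_offset)
{
	/* per-bio setup would go here (csums, ordered extents, ...) */
	submit_bio(rw, bio);
}

static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
				 const struct iovec *iov, loff_t offset,
				 unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				    iov, offset, nr_segs,
				    example_get_block,	/* hypothetical */
				    NULL,		/* no end_io hook */
				    example_submit_io, DIO_LOCKING);
}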
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/slab.h>
17#include <linux/in.h> 18#include <linux/in.h>
18#include <linux/in6.h> 19#include <linux/in6.h>
19#include <net/ipv6.h> 20#include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h>
18 19
19#include "dlm_internal.h" 20#include "dlm_internal.h"
20#include "lock.h" 21#include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/slab.h>
59#include "dlm_internal.h" 60#include "dlm_internal.h"
60#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
61#include "memory.h" 62#include "memory.h"
@@ -732,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
732 if (lkb->lkb_rqmode < mode) 733 if (lkb->lkb_rqmode < mode)
733 break; 734 break;
734 735
735 if (!lkb) 736 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
736 list_add_tail(new, head);
737 else
738 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
739} 737}
740 738
741/* add/remove lkb to rsb's grant/convert/wait queue */ 739/* add/remove lkb to rsb's grant/convert/wait queue */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
51#include <linux/file.h> 51#include <linux/file.h>
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h>
54#include <net/sctp/user.h> 55#include <net/sctp/user.h>
55#include <net/ipv6.h> 56#include <net/ipv6.h>
56 57
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
9#include <net/genetlink.h> 9#include <net/genetlink.h>
10#include <linux/dlm.h> 10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h> 11#include <linux/dlm_netlink.h>
12#include <linux/gfp.h>
12 13
13#include "dlm_internal.h" 14#include "dlm_internal.h"
14 15
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h> 12#include <linux/dlm.h>
13#include <linux/dlm_plock.h> 13#include <linux/dlm_plock.h>
14#include <linux/slab.h>
14 15
15#include "dlm_internal.h" 16#include "dlm_internal.h"
16#include "lockspace.h" 17#include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/dlm.h> 18#include <linux/dlm.h>
19#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
20#include <linux/slab.h>
20 21
21#include "dlm_internal.h" 22#include "dlm_internal.h"
22#include "lockspace.h" 23#include "lockspace.h"
@@ -214,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
214 if (!ast_type) { 215 if (!ast_type) {
215 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
216 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 217 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
217 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
218 } 220 }
219 if (type == AST_COMP && (ast_type & AST_COMP)) 221 if (type == AST_COMP && (ast_type & AST_COMP))
@@ -222,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
222 224
223 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); 225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
224 if (eol) { 226 if (eol) {
225 lkb->lkb_ast_type &= ~AST_BAST;
226 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; 227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
227 } 228 }
228 229
@@ -705,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
705} 706}
706 707
707static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
708 int bmode, char __user *buf, size_t count) 709 int mode, char __user *buf, size_t count)
709{ 710{
710#ifdef CONFIG_COMPAT 711#ifdef CONFIG_COMPAT
711 struct dlm_lock_result32 result32; 712 struct dlm_lock_result32 result32;
@@ -732,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
732 if (type == AST_BAST) { 733 if (type == AST_BAST) {
733 result.user_astaddr = ua->bastaddr; 734 result.user_astaddr = ua->bastaddr;
734 result.user_astparam = ua->bastparam; 735 result.user_astparam = ua->bastparam;
735 result.bast_mode = bmode; 736 result.bast_mode = mode;
736 } else { 737 } else {
737 result.user_astaddr = ua->castaddr; 738 result.user_astaddr = ua->castaddr;
738 result.user_astparam = ua->castparam; 739 result.user_astparam = ua->castparam;
@@ -800,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
800 struct dlm_user_proc *proc = file->private_data; 801 struct dlm_user_proc *proc = file->private_data;
801 struct dlm_lkb *lkb; 802 struct dlm_lkb *lkb;
802 DECLARE_WAITQUEUE(wait, current); 803 DECLARE_WAITQUEUE(wait, current);
803 int error, type=0, bmode=0, removed = 0; 804 int error = 0, removed;
805 int ret_type, ret_mode;
806 int bastmode, castmode, do_bast, do_cast;
804 807
805 if (count == sizeof(struct dlm_device_version)) { 808 if (count == sizeof(struct dlm_device_version)) {
806 error = copy_version_to_user(buf, count); 809 error = copy_version_to_user(buf, count);
@@ -819,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
819#endif 822#endif
820 return -EINVAL; 823 return -EINVAL;
821 824
825 try_another:
826
822 /* do we really need this? can a read happen after a close? */ 827 /* do we really need this? can a read happen after a close? */
823 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) 828 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
824 return -EINVAL; 829 return -EINVAL;
@@ -854,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 859
855 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
856 861
857 if (lkb->lkb_ast_type & AST_COMP) { 862 removed = 0;
858 lkb->lkb_ast_type &= ~AST_COMP; 863 ret_type = 0;
859 type = AST_COMP; 864 ret_mode = 0;
860 } else if (lkb->lkb_ast_type & AST_BAST) { 865 do_bast = lkb->lkb_ast_type & AST_BAST;
861 lkb->lkb_ast_type &= ~AST_BAST; 866 do_cast = lkb->lkb_ast_type & AST_COMP;
862 type = AST_BAST; 867 bastmode = lkb->lkb_bastmode;
863 bmode = lkb->lkb_bastmode; 868 castmode = lkb->lkb_castmode;
869
 870 /* when both are queued, figure out which to deliver first, and flip
 871 the order so the other goes out in the next read */
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 }
891
892 /* if we're doing a bast but the bast is unnecessary, then
893 switch to do nothing or do a cast if that was needed next */
894
895 if ((ret_type == AST_BAST) &&
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 }
907
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
909 log_print("device_read %x ast_first %x ast_type %x",
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
864 } 911 }
865 912
866 if (!lkb->lkb_ast_type) { 913 if (!lkb->lkb_ast_type) {
@@ -869,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
869 } 916 }
870 spin_unlock(&proc->asts_spin); 917 spin_unlock(&proc->asts_spin);
871 918
872 error = copy_result_to_user(lkb->lkb_ua, 919 if (ret_type) {
873 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 920 error = copy_result_to_user(lkb->lkb_ua,
874 type, bmode, buf, count); 921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
922 ret_type, ret_mode, buf, count);
923
924 if (ret_type == AST_COMP)
925 lkb->lkb_castmode_done = castmode;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 }
875 929
876 /* removes reference for the proc->asts lists added by 930 /* removes reference for the proc->asts lists added by
877 dlm_user_add_ast() and may result in the lkb being freed */ 931 dlm_user_add_ast() and may result in the lkb being freed */
932
878 if (removed) 933 if (removed)
879 dlm_put_lkb(lkb); 934 dlm_put_lkb(lkb);
880 935
 936 /* the queued bast was eliminated (see the "unnecessary" check above),
 937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
881 return error; 942 return error;
882} 943}
883 944
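The device_read() rewrite serializes cast/bast delivery by queueing order instead of always preferring the completion ast. Reduced to a decision helper, and omitting the unnecessary-bast suppression, the core of it is (a simplification, not the verbatim kernel code):

static int example_pick_ast(struct dlm_lkb *lkb, int *mode)
{
	int type = lkb->lkb_ast_first;	/* oldest queued ast wins */

	if (!type)
		return 0;
	*mode = (type == AST_COMP) ? lkb->lkb_castmode : lkb->lkb_bastmode;
	lkb->lkb_ast_type &= ~type;
	/* if the other kind is still pending, it goes first next time */
	lkb->lkb_ast_first = lkb->lkb_ast_type;
	return type;
}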
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
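iterate_supers() packages the pinned-superblock walk (compare the fs/dcache.c sketch earlier) behind a callback interface; drop_pagecache_sb() only gains the opaque-cookie argument. A hypothetical second caller:

static void example_count_inodes(struct super_block *sb, void *arg)
{
	unsigned long *total = arg;
	struct inode *inode;

	spin_lock(&inode_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
		(*total)++;
	spin_unlock(&inode_lock);
}

static unsigned long example_count_all_inodes(void)
{
	unsigned long total = 0;

	/* invokes the callback for each live sb with s_umount held shared */
	iterate_supers(example_count_inodes, &total);
	return total;
}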
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/slab.h>
36#include <asm/unaligned.h> 37#include <asm/unaligned.h>
37#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
38 39
@@ -381,8 +382,8 @@ out:
381static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 382static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
382 struct ecryptfs_crypt_stat *crypt_stat) 383 struct ecryptfs_crypt_stat *crypt_stat)
383{ 384{
384 (*offset) = (crypt_stat->num_header_bytes_at_front 385 (*offset) = ecryptfs_lower_header_size(crypt_stat)
385 + (crypt_stat->extent_size * extent_num)); 386 + (crypt_stat->extent_size * extent_num);
386} 387}
387 388
388/** 389/**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
834 set_extent_mask_and_shift(crypt_stat); 835 set_extent_mask_and_shift(crypt_stat);
835 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; 836 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
836 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 837 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
837 crypt_stat->num_header_bytes_at_front = 0; 838 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
838 else { 839 else {
839 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) 840 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
840 crypt_stat->num_header_bytes_at_front = 841 crypt_stat->metadata_size =
841 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; 842 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
842 else 843 else
843 crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; 844 crypt_stat->metadata_size = PAGE_CACHE_SIZE;
844 } 845 }
845} 846}
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1107 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1108} 1109}
1109 1110
1110static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1111write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1112 size_t *written) 1113 size_t *written)
1113{ 1114{
1114 u32 flags = 0; 1115 u32 flags = 0;
1115 int i; 1116 int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
1237 1238
1238 header_extent_size = (u32)crypt_stat->extent_size; 1239 header_extent_size = (u32)crypt_stat->extent_size;
1239 num_header_extents_at_front = 1240 num_header_extents_at_front =
1240 (u16)(crypt_stat->num_header_bytes_at_front 1241 (u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
1241 / crypt_stat->extent_size);
1242 put_unaligned_be32(header_extent_size, virt); 1242 put_unaligned_be32(header_extent_size, virt);
1243 virt += 4; 1243 virt += 4;
1244 put_unaligned_be16(num_header_extents_at_front, virt); 1244 put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1291 offset = ECRYPTFS_FILE_SIZE_BYTES; 1291 offset = ECRYPTFS_FILE_SIZE_BYTES;
1292 write_ecryptfs_marker((page_virt + offset), &written); 1292 write_ecryptfs_marker((page_virt + offset), &written);
1293 offset += written; 1293 offset += written;
1294 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); 1294 ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
1295 &written);
1295 offset += written; 1296 offset += written;
1296 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, 1297 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
1297 &written); 1298 &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1382 rc = -EINVAL;
1382 goto out; 1383 goto out;
1383 } 1384 }
1384 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1385 order = get_order(virt_len); 1386 order = get_order(virt_len);
1386 /* Released in this function */ 1387 /* Released in this function */
1387 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1427 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1428 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1429 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1430 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1431 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1432 * (size_t)header_extent_size));
1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1435 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1437 rc = -EINVAL; 1437 rc = -EINVAL;
1438 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1439 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1440 } 1440 }
1441 return rc; 1441 return rc;
1442} 1442}
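
parse_header_metadata() above reads two big-endian fields and multiplies them to obtain metadata_size. A self-contained sketch of the same decoding, with get_unaligned_be32()/get_unaligned_be16() modeled by hand; the byte values are made up for the example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hand-rolled stand-ins for the kernel's get_unaligned_be32/be16 */
static uint32_t be32_at(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static uint16_t be16_at(const unsigned char *p)
{
	return (uint16_t)(((uint16_t)p[0] << 8) | p[1]);
}

int main(void)
{
	/* 4096-byte header extents, 2 of them => 8192 bytes of metadata */
	unsigned char virt[6] = { 0x00, 0x00, 0x10, 0x00, 0x00, 0x02 };
	uint32_t header_extent_size = be32_at(virt);
	uint16_t num_header_extents = be16_at(virt + 4);
	size_t metadata_size = (size_t)num_header_extents * header_extent_size;

	printf("metadata_size = %zu\n", metadata_size);	/* 8192 */
	return 0;
}
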
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1451 */ 1451 */
1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1453{ 1453{
1454 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1455 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456} 1455}
1457 1456
1458/** 1457/**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1606 ecryptfs_dentry, 1605 ecryptfs_dentry,
1607 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1608 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1610 if (rc) { 1610 if (rc) {
1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
@@ -718,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
718int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
719 struct page *page_for_lower, 732 struct page *page_for_lower,
720 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
721int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
722 size_t size);
723int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
724 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
725int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
726 pgoff_t page_index, 738 pgoff_t page_index,
727 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
728 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
729struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
730int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
731int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
732 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
@@ -273,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
273} 274}
274 275
275static int 276static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
277{ 278{
278 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
279 ecryptfs_dentry_to_lower(dentry),
280 datasync);
281} 280}
282 281
283static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -141,19 +142,10 @@ out:
141static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
142{ 143{
143 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
144 struct file fake_file;
145 struct ecryptfs_file_info tmp_file_info;
146 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
147 int rc = 0; 146 int rc = 0;
148 147
149 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
150 fake_file.f_path.dentry = ecryptfs_dentry;
151 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
152 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
153 ecryptfs_set_file_lower(
154 &fake_file,
155 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
156 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
157 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
158 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
159 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -323,6 +315,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
323 rc = ecryptfs_read_and_validate_header_region(page_virt, 315 rc = ecryptfs_read_and_validate_header_region(page_virt,
324 ecryptfs_dentry->d_inode); 316 ecryptfs_dentry->d_inode);
325 if (rc) { 317 if (rc) {
318 memset(page_virt, 0, PAGE_CACHE_SIZE);
326 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 319 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
327 ecryptfs_dentry); 320 ecryptfs_dentry);
328 if (rc) { 321 if (rc) {
@@ -335,7 +328,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
335 ecryptfs_dentry->d_sb)->mount_crypt_stat; 328 ecryptfs_dentry->d_sb)->mount_crypt_stat;
336 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 329 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
337 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 330 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
338 file_size = (crypt_stat->num_header_bytes_at_front 331 file_size = (crypt_stat->metadata_size
339 + i_size_read(lower_dentry->d_inode)); 332 + i_size_read(lower_dentry->d_inode));
340 else 333 else
341 file_size = i_size_read(lower_dentry->d_inode); 334 file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +380,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 380 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
388 if (IS_ERR(lower_dentry)) { 381 if (IS_ERR(lower_dentry)) {
389 rc = PTR_ERR(lower_dentry); 382 rc = PTR_ERR(lower_dentry);
390 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 383 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
391 "lower_dentry = [%s]\n", __func__, rc, 384 "[%d] on lower_dentry = [%s]\n", __func__, rc,
392 ecryptfs_dentry->d_name.name); 385 encrypted_and_encoded_name);
393 goto out_d_drop; 386 goto out_d_drop;
394 } 387 }
395 if (lower_dentry->d_inode) 388 if (lower_dentry->d_inode)
@@ -416,9 +409,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
416 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 409 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
417 if (IS_ERR(lower_dentry)) { 410 if (IS_ERR(lower_dentry)) {
418 rc = PTR_ERR(lower_dentry); 411 rc = PTR_ERR(lower_dentry);
419 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 412 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
420 "lower_dentry = [%s]\n", __func__, rc, 413 "[%d] on lower_dentry = [%s]\n", __func__, rc,
421 encrypted_and_encoded_name); 414 encrypted_and_encoded_name);
422 goto out_d_drop; 415 goto out_d_drop;
423 } 416 }
424lookup_and_interpose: 417lookup_and_interpose:
@@ -455,8 +448,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
455 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 448 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
456 if (rc) 449 if (rc)
457 goto out_lock; 450 goto out_lock;
458 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 451 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
459 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 452 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
460 old_dentry->d_inode->i_nlink = 453 old_dentry->d_inode->i_nlink =
461 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 454 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
462 i_size_write(new_dentry->d_inode, file_size_save); 455 i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +640,17 @@ out_lock:
647 return rc; 640 return rc;
648} 641}
649 642
650static int 643static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
651ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 644 size_t *bufsiz)
652{ 645{
646 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
653 char *lower_buf; 647 char *lower_buf;
654 size_t lower_bufsiz; 648 size_t lower_bufsiz = PATH_MAX;
655 struct dentry *lower_dentry;
656 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
657 char *plaintext_name;
658 size_t plaintext_name_size;
659 mm_segment_t old_fs; 649 mm_segment_t old_fs;
660 int rc; 650 int rc;
661 651
662 lower_dentry = ecryptfs_dentry_to_lower(dentry);
663 if (!lower_dentry->d_inode->i_op->readlink) {
664 rc = -EINVAL;
665 goto out;
666 }
667 mount_crypt_stat = &ecryptfs_superblock_to_private(
668 dentry->d_sb)->mount_crypt_stat;
669 /*
670 * If the lower filename is encrypted, it will result in a significantly
671 * longer name. If needed, truncate the name after decode and decrypt.
672 */
673 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
674 lower_bufsiz = PATH_MAX;
675 else
676 lower_bufsiz = bufsiz;
677 /* Released in this function */
678 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 652 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
679 if (lower_buf == NULL) { 653 if (!lower_buf) {
680 printk(KERN_ERR "%s: Out of memory whilst attempting to "
681 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
682 rc = -ENOMEM; 654 rc = -ENOMEM;
683 goto out; 655 goto out;
684 } 656 }
@@ -688,29 +660,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
688 (char __user *)lower_buf, 660 (char __user *)lower_buf,
689 lower_bufsiz); 661 lower_bufsiz);
690 set_fs(old_fs); 662 set_fs(old_fs);
691 if (rc >= 0) { 663 if (rc < 0)
692 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 664 goto out;
693 &plaintext_name_size, 665 lower_bufsiz = rc;
694 dentry, lower_buf, 666 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
695 rc); 667 lower_buf, lower_bufsiz);
696 if (rc) { 668out:
697 printk(KERN_ERR "%s: Error attempting to decode and "
698 "decrypt filename; rc = [%d]\n", __func__,
699 rc);
700 goto out_free_lower_buf;
701 }
702 /* Check for bufsiz <= 0 done in sys_readlinkat() */
703 rc = copy_to_user(buf, plaintext_name,
704 min((size_t) bufsiz, plaintext_name_size));
705 if (rc)
706 rc = -EFAULT;
707 else
708 rc = plaintext_name_size;
709 kfree(plaintext_name);
710 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
711 }
712out_free_lower_buf:
713 kfree(lower_buf); 669 kfree(lower_buf);
670 return rc;
671}
672
673static int
674ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
675{
676 char *kbuf;
677 size_t kbufsiz, copied;
678 int rc;
679
680 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
681 if (rc)
682 goto out;
683 copied = min_t(size_t, bufsiz, kbufsiz);
684 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
685 kfree(kbuf);
686 fsstack_copy_attr_atime(dentry->d_inode,
687 ecryptfs_dentry_to_lower(dentry)->d_inode);
714out: 688out:
715 return rc; 689 return rc;
716} 690}
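
The rewritten ecryptfs_readlink() copies min(bufsiz, kbufsiz) bytes to the caller, which matches the readlink(2) contract: a possibly-truncated target with no NUL terminator. A small userspace check of that contract against any symlink (illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[64];	/* deliberately small: readlink() may truncate */
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <symlink>\n", argv[0]);
		return 1;
	}
	/* Mirrors the kernel side above: the caller gets at most
	 * sizeof(buf) bytes of the (decrypted) target, unterminated. */
	n = readlink(argv[1], buf, sizeof(buf));
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	printf("target (%zd bytes): %.*s\n", n, (int)n, buf);
	return 0;
}
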
@@ -768,7 +742,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
768{ 742{
769 loff_t lower_size; 743 loff_t lower_size;
770 744
771 lower_size = crypt_stat->num_header_bytes_at_front; 745 lower_size = ecryptfs_lower_header_size(crypt_stat);
772 if (upper_size != 0) { 746 if (upper_size != 0) {
773 loff_t num_extents; 747 loff_t num_extents;
774 748
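
upper_size_to_lower_size() now starts from ecryptfs_lower_header_size(); in the part of the function elided by this hunk it presumably rounds the upper size up to whole extents, as earlier kernels did. A standalone model of the computation under that assumption:

#include <stdio.h>

/* Illustrative model of upper_size_to_lower_size(): the lower file is
 * the header region plus the upper size rounded up to whole extents.
 * The rounding step is assumed from the elided remainder of the
 * function, not shown in the hunk above. */
static unsigned long long upper_to_lower(unsigned long long upper_size,
					 unsigned long header_size,
					 unsigned long extent_size)
{
	unsigned long long lower_size = header_size;

	if (upper_size != 0) {
		unsigned long long num_extents =
			(upper_size + extent_size - 1) / extent_size;
		lower_size += num_extents * extent_size;
	}
	return lower_size;
}

int main(void)
{
	/* 8 KiB header, 4 KiB extents: a 1-byte file still costs an extent */
	printf("%llu\n", upper_to_lower(1, 8192, 4096));	/* 12288 */
	printf("%llu\n", upper_to_lower(4097, 8192, 4096));	/* 16384 */
	return 0;
}
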
@@ -801,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
801{ 775{
802 int rc = 0; 776 int rc = 0;
803 struct inode *inode = dentry->d_inode; 777 struct inode *inode = dentry->d_inode;
804 struct dentry *lower_dentry;
805 struct file fake_ecryptfs_file;
806 struct ecryptfs_crypt_stat *crypt_stat; 778 struct ecryptfs_crypt_stat *crypt_stat;
807 loff_t i_size = i_size_read(inode); 779 loff_t i_size = i_size_read(inode);
808 loff_t lower_size_before_truncate; 780 loff_t lower_size_before_truncate;
@@ -813,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
813 goto out; 785 goto out;
814 } 786 }
815 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 787 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
816 /* Set up a fake ecryptfs file, this is used to interface with
817 * the file in the underlying filesystem so that the
818 * truncation has an effect there as well. */
819 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
820 fake_ecryptfs_file.f_path.dentry = dentry;
821 /* Released at out_free: label */
822 ecryptfs_set_file_private(&fake_ecryptfs_file,
823 kmem_cache_alloc(ecryptfs_file_info_cache,
824 GFP_KERNEL));
825 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
826 rc = -ENOMEM;
827 goto out;
828 }
829 lower_dentry = ecryptfs_dentry_to_lower(dentry);
830 ecryptfs_set_file_lower(
831 &fake_ecryptfs_file,
832 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
833 /* Switch on growing or shrinking file */ 788 /* Switch on growing or shrinking file */
834 if (ia->ia_size > i_size) { 789 if (ia->ia_size > i_size) {
835 char zero[] = { 0x00 }; 790 char zero[] = { 0x00 };
@@ -839,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
839 * this triggers code that will fill in 0's throughout 794 * this triggers code that will fill in 0's throughout
 840 * the intermediate portion between the previous end of the 795 * the intermediate portion between the previous end of the
 841 * file and the new end of the file */ 796 * file and the new end of the file */
842 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 797 rc = ecryptfs_write(inode, zero,
843 (ia->ia_size - 1), 1); 798 (ia->ia_size - 1), 1);
844 } else { /* ia->ia_size < i_size_read(inode) */ 799 } else { /* ia->ia_size < i_size_read(inode) */
845 /* We're chopping off all the pages down to the page 800 /* We're chopping off all the pages down to the page
@@ -850,12 +805,12 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
850 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
851 806
852 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
853 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
854 if (rc) 809 if (rc)
855 goto out_free; 810 goto out;
856 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
857 lower_ia->ia_valid |= ATTR_SIZE; 812 lower_ia->ia_valid |= ATTR_SIZE;
858 goto out_free; 813 goto out;
859 } 814 }
860 if (num_zeros) { 815 if (num_zeros) {
861 char *zeros_virt; 816 char *zeros_virt;
@@ -863,25 +818,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
863 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 818 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
864 if (!zeros_virt) { 819 if (!zeros_virt) {
865 rc = -ENOMEM; 820 rc = -ENOMEM;
866 goto out_free; 821 goto out;
867 } 822 }
868 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 823 rc = ecryptfs_write(inode, zeros_virt,
869 ia->ia_size, num_zeros); 824 ia->ia_size, num_zeros);
870 kfree(zeros_virt); 825 kfree(zeros_virt);
871 if (rc) { 826 if (rc) {
872 printk(KERN_ERR "Error attempting to zero out " 827 printk(KERN_ERR "Error attempting to zero out "
873 "the remainder of the end page on " 828 "the remainder of the end page on "
874 "reducing truncate; rc = [%d]\n", rc); 829 "reducing truncate; rc = [%d]\n", rc);
875 goto out_free; 830 goto out;
876 } 831 }
877 } 832 }
878 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
879 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
880 if (rc) { 835 if (rc) {
881 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
882 "ecryptfs_write_inode_size_to_metadata; " 837 "ecryptfs_write_inode_size_to_metadata; "
883 "rc = [%d]\n", rc); 838 "rc = [%d]\n", rc);
884 goto out_free; 839 goto out;
885 } 840 }
886 /* We are reducing the size of the ecryptfs file, and need to 841 /* We are reducing the size of the ecryptfs file, and need to
887 * know if we need to reduce the size of the lower file. */ 842 * know if we need to reduce the size of the lower file. */
@@ -895,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
895 } else 850 } else
896 lower_ia->ia_valid &= ~ATTR_SIZE; 851 lower_ia->ia_valid &= ~ATTR_SIZE;
897 } 852 }
898out_free:
899 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
900 kmem_cache_free(ecryptfs_file_info_cache,
901 ecryptfs_file_to_private(&fake_ecryptfs_file));
902out: 853out:
903 return rc; 854 return rc;
904} 855}
@@ -1015,6 +966,28 @@ out:
1015 return rc; 966 return rc;
1016} 967}
1017 968
969int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
970 struct kstat *stat)
971{
972 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
973 int rc = 0;
974
975 mount_crypt_stat = &ecryptfs_superblock_to_private(
976 dentry->d_sb)->mount_crypt_stat;
977 generic_fillattr(dentry->d_inode, stat);
978 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
979 char *target;
980 size_t targetsiz;
981
982 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
983 if (!rc) {
984 kfree(target);
985 stat->size = targetsiz;
986 }
987 }
988 return rc;
989}
990
1018int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 991int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1019 struct kstat *stat) 992 struct kstat *stat)
1020{ 993{
@@ -1039,7 +1012,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 1012
1040 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1013 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1041 if (!lower_dentry->d_inode->i_op->setxattr) { 1014 if (!lower_dentry->d_inode->i_op->setxattr) {
1042 rc = -ENOSYS; 1015 rc = -EOPNOTSUPP;
1043 goto out; 1016 goto out;
1044 } 1017 }
1045 mutex_lock(&lower_dentry->d_inode->i_mutex); 1018 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1030,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
1057 int rc = 0; 1030 int rc = 0;
1058 1031
1059 if (!lower_dentry->d_inode->i_op->getxattr) { 1032 if (!lower_dentry->d_inode->i_op->getxattr) {
1060 rc = -ENOSYS; 1033 rc = -EOPNOTSUPP;
1061 goto out; 1034 goto out;
1062 } 1035 }
1063 mutex_lock(&lower_dentry->d_inode->i_mutex); 1036 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1057,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1084 1057
1085 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1058 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1086 if (!lower_dentry->d_inode->i_op->listxattr) { 1059 if (!lower_dentry->d_inode->i_op->listxattr) {
1087 rc = -ENOSYS; 1060 rc = -EOPNOTSUPP;
1088 goto out; 1061 goto out;
1089 } 1062 }
1090 mutex_lock(&lower_dentry->d_inode->i_mutex); 1063 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1074,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1101 1074
1102 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1075 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1103 if (!lower_dentry->d_inode->i_op->removexattr) { 1076 if (!lower_dentry->d_inode->i_op->removexattr) {
1104 rc = -ENOSYS; 1077 rc = -EOPNOTSUPP;
1105 goto out; 1078 goto out;
1106 } 1079 }
1107 mutex_lock(&lower_dentry->d_inode->i_mutex); 1080 mutex_lock(&lower_dentry->d_inode->i_mutex);
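
The four hunks above switch missing-xattr-operation errors from ENOSYS to EOPNOTSUPP, which userspace sees as ENOTSUP (the same value on Linux) and which xattr-aware tools already expect. A quick way to observe the change from userspace (illustrative):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char value[256];

	if (argc < 2)
		return 1;
	/* On a mount whose lower filesystem lacks xattr support, this
	 * now fails with EOPNOTSUPP/ENOTSUP rather than ENOSYS. */
	if (getxattr(argv[1], "user.test", value, sizeof(value)) < 0)
		printf("getxattr: %s\n", strerror(errno));
	return 0;
}
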
@@ -1132,6 +1105,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1132 .put_link = ecryptfs_put_link, 1105 .put_link = ecryptfs_put_link,
1133 .permission = ecryptfs_permission, 1106 .permission = ecryptfs_permission,
1134 .setattr = ecryptfs_setattr, 1107 .setattr = ecryptfs_setattr,
1108 .getattr = ecryptfs_getattr_link,
1135 .setxattr = ecryptfs_setxattr, 1109 .setxattr = ecryptfs_setxattr,
1136 .getxattr = ecryptfs_getxattr, 1110 .getxattr = ecryptfs_getxattr,
1137 .listxattr = ecryptfs_listxattr, 1111 .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h>
38#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
39 40
40/** 41/**
@@ -280,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
280 * 281 *
281 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
282 */ 283 */
283static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
284{ 285{
285 char *p; 286 char *p;
286 int rc = 0; 287 int rc = 0;
@@ -292,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
292 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
296 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
297 int token; 298 int token;
298 char *sig_src; 299 char *sig_src;
@@ -482,60 +483,7 @@ out:
482} 483}
483 484
484struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
485 486static struct file_system_type ecryptfs_fs_type;
486/**
487 * ecryptfs_fill_super
488 * @sb: The ecryptfs super block
489 * @raw_data: The options passed to mount
490 * @silent: Not used but required by function prototype
491 *
492 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
493 *
494 * Returns zero on success; non-zero otherwise
495 */
496static int
497ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
498{
499 int rc = 0;
500
501 /* Released in ecryptfs_put_super() */
502 ecryptfs_set_superblock_private(sb,
503 kmem_cache_zalloc(ecryptfs_sb_info_cache,
504 GFP_KERNEL));
505 if (!ecryptfs_superblock_to_private(sb)) {
506 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
507 rc = -ENOMEM;
508 goto out;
509 }
510 sb->s_op = &ecryptfs_sops;
511 /* Released through deactivate_super(sb) from get_sb_nodev */
512 sb->s_root = d_alloc(NULL, &(const struct qstr) {
513 .hash = 0,.name = "/",.len = 1});
514 if (!sb->s_root) {
515 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
516 rc = -ENOMEM;
517 goto out;
518 }
519 sb->s_root->d_op = &ecryptfs_dops;
520 sb->s_root->d_sb = sb;
521 sb->s_root->d_parent = sb->s_root;
522 /* Released in d_release when dput(sb->s_root) is called */
523 /* through deactivate_super(sb) from get_sb_nodev() */
524 ecryptfs_set_dentry_private(sb->s_root,
525 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
526 GFP_KERNEL));
527 if (!ecryptfs_dentry_to_private(sb->s_root)) {
528 ecryptfs_printk(KERN_ERR,
529 "dentry_info_cache alloc failed\n");
530 rc = -ENOMEM;
531 goto out;
532 }
533 rc = 0;
534out:
535 /* Should be able to rely on deactivate_super called from
536 * get_sb_nodev */
537 return rc;
538}
539 487
540/** 488/**
541 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -556,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
556 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
557 goto out; 505 goto out;
558 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
559 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
560 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
561 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -579,11 +534,8 @@ out:
579 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
580 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
581 * 536 *
582 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
583 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
584 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
585 * with as much information as it can before needing
586 * the lower filesystem.
587 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
588 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
589 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -592,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
592 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
593 struct vfsmount *mnt) 545 struct vfsmount *mnt)
594{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
595 int rc; 551 int rc;
596 struct super_block *sb;
597 552
598 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
599 if (rc < 0) { 554 if (!sbi) {
600 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
601 goto out; 556 goto out;
602 } 557 }
603 sb = mnt->mnt_sb; 558
604 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
605 if (rc) { 560 if (rc) {
606 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
607 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
608 } 569 }
609 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
610 if (rc) { 573 if (rc) {
611 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
612 goto out_abort; 575 goto out;
613 } 576 }
614 goto out; 577
615out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
616 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
617 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
618out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 619 return rc;
620} 620}
621 621
@@ -624,11 +624,16 @@ out:
624 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
625 * 625 *
626 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
627 * Private data is free'd in ecryptfs_put_super()
628 */ 627 */
629static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
630{ 629{
631 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
632} 637}
633 638
634static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -43,17 +44,9 @@
43 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
44 * refcnt. 45 * refcnt.
45 */ 46 */
46struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
47{ 48{
48 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
49 struct inode *inode;
50 struct address_space *mapping;
51 struct page *page;
52
53 dentry = file->f_path.dentry;
54 inode = dentry->d_inode;
55 mapping = inode->i_mapping;
56 page = read_mapping_page(mapping, index, (void *)file);
57 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
58 lock_page(page); 51 lock_page(page);
59 return page; 52 return page;
@@ -82,6 +75,19 @@ out:
82 return rc; 75 return rc;
83} 76}
84 77
78static void strip_xattr_flag(char *page_virt,
79 struct ecryptfs_crypt_stat *crypt_stat)
80{
81 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
82 size_t written;
83
84 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
85 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
86 &written);
87 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
88 }
89}
90
85/** 91/**
86 * Header Extent: 92 * Header Extent:
87 * Octets 0-7: Unencrypted file size (big-endian) 93 * Octets 0-7: Unencrypted file size (big-endian)
@@ -97,19 +103,6 @@ out:
97 * (big-endian) 103 * (big-endian)
98 * Octet 26: Begin RFC 2440 authentication token packet set 104 * Octet 26: Begin RFC 2440 authentication token packet set
99 */ 105 */
100static void set_header_info(char *page_virt,
101 struct ecryptfs_crypt_stat *crypt_stat)
102{
103 size_t written;
104 size_t save_num_header_bytes_at_front =
105 crypt_stat->num_header_bytes_at_front;
106
107 crypt_stat->num_header_bytes_at_front =
108 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
109 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
110 crypt_stat->num_header_bytes_at_front =
111 save_num_header_bytes_at_front;
112}
113 106
114/** 107/**
115 * ecryptfs_copy_up_encrypted_with_header 108 * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +128,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
135 * num_extents_per_page) 128 * num_extents_per_page)
136 + extent_num_in_page); 129 + extent_num_in_page);
137 size_t num_header_extents_at_front = 130 size_t num_header_extents_at_front =
138 (crypt_stat->num_header_bytes_at_front 131 (crypt_stat->metadata_size / crypt_stat->extent_size);
139 / crypt_stat->extent_size);
140 132
141 if (view_extent_num < num_header_extents_at_front) { 133 if (view_extent_num < num_header_extents_at_front) {
142 /* This is a header extent */ 134 /* This is a header extent */
@@ -146,9 +138,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
146 memset(page_virt, 0, PAGE_CACHE_SIZE); 138 memset(page_virt, 0, PAGE_CACHE_SIZE);
147 /* TODO: Support more than one header extent */ 139 /* TODO: Support more than one header extent */
148 if (view_extent_num == 0) { 140 if (view_extent_num == 0) {
141 size_t written;
142
149 rc = ecryptfs_read_xattr_region( 143 rc = ecryptfs_read_xattr_region(
150 page_virt, page->mapping->host); 144 page_virt, page->mapping->host);
151 set_header_info(page_virt, crypt_stat); 145 strip_xattr_flag(page_virt + 16, crypt_stat);
146 ecryptfs_write_header_metadata(page_virt + 20,
147 crypt_stat,
148 &written);
152 } 149 }
153 kunmap_atomic(page_virt, KM_USER0); 150 kunmap_atomic(page_virt, KM_USER0);
154 flush_dcache_page(page); 151 flush_dcache_page(page);
@@ -161,7 +158,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
161 /* This is an encrypted data extent */ 158 /* This is an encrypted data extent */
162 loff_t lower_offset = 159 loff_t lower_offset =
163 ((view_extent_num * crypt_stat->extent_size) 160 ((view_extent_num * crypt_stat->extent_size)
164 - crypt_stat->num_header_bytes_at_front); 161 - crypt_stat->metadata_size);
165 162
166 rc = ecryptfs_read_lower_page_segment( 163 rc = ecryptfs_read_lower_page_segment(
167 page, (lower_offset >> PAGE_CACHE_SHIFT), 164 page, (lower_offset >> PAGE_CACHE_SHIFT),
@@ -193,7 +190,7 @@ out:
193static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
194{ 191{
195 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
196 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
197 int rc = 0; 194 int rc = 0;
198 195
199 if (!crypt_stat 196 if (!crypt_stat
@@ -295,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
295 292
296 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
297 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
298 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
299 file->f_path.dentry->d_inode)->crypt_stat;
300 296
301 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
302 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -482,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
482 unsigned to = from + copied; 478 unsigned to = from + copied;
483 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
484 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
485 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
486 int rc; 482 int rc;
487 483
488 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
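
Piecing together the offsets used above (page_virt + 16 for the flags word in strip_xattr_flag(), page_virt + 20 for the header metadata) with the octet map in the comment block, the header extent layout can be modeled as below. This is an editorial sketch inferred from those offsets, not kernel code:

#include <stdio.h>

/* Inferred eCryptfs header-extent layout:
 *   0- 7  unencrypted file size (big-endian)
 *   8-15  eCryptfs marker
 *  16-19  flags (big-endian), incl. the METADATA_IN_XATTR bit that
 *         strip_xattr_flag() masks out for the encrypted view
 *  20-23  header extent size (big-endian u32)
 *  24-25  number of header extents at front (big-endian u16)
 *  26-    RFC 2440 authentication token packet set */
enum {
	OFF_FILE_SIZE	= 0,
	OFF_MARKER	= 8,
	OFF_FLAGS	= 16,
	OFF_HEADER_META	= 20,
	OFF_PACKETS	= 26,
};

int main(void)
{
	printf("flags at %d, header metadata at %d, packets at %d\n",
	       OFF_FLAGS, OFF_HEADER_META, OFF_PACKETS);
	return 0;
}
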
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
85 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
86 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
88 d_drop(lower_dentry);
89 } 89 }
90 } 90 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -109,26 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
126 ecryptfs_set_superblock_private(sb, NULL);
127
128 unlock_kernel();
129}
130
131/**
132 * ecryptfs_statfs 112 * ecryptfs_statfs
133 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
134 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -202,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
202 .alloc_inode = ecryptfs_alloc_inode, 182 .alloc_inode = ecryptfs_alloc_inode,
203 .destroy_inode = ecryptfs_destroy_inode, 183 .destroy_inode = ecryptfs_destroy_inode,
204 .drop_inode = generic_delete_inode, 184 .drop_inode = generic_delete_inode,
205 .put_super = ecryptfs_put_super,
206 .statfs = ecryptfs_statfs, 185 .statfs = ecryptfs_statfs,
207 .remount_fs = NULL, 186 .remount_fs = NULL,
208 .clear_inode = ecryptfs_clear_inode, 187 .clear_inode = ecryptfs_clear_inode,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
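
__add_wait_queue_exclusive() folds the two-step sequence it replaces into one helper; it is presumably equivalent to something like the following sketch (inferred from the open-coded pair removed above, not copied from the patch):

static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
					      wait_queue_t *wait)
{
	/* Same effect as setting WQ_FLAG_EXCLUSIVE by hand and then
	 * calling __add_wait_queue(), as ep_poll() used to do. */
	wait->flags |= WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(q, wait);
}
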
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa19e5b9..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
763 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
764 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
765 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
766 int count;
767 771
768 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
769 goto no_thread_group; 773 goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
780 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
781 return -EAGAIN; 785 return -EAGAIN;
782 } 786 }
787
783 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
784 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
785 792
786 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
787 count = thread_group_leader(tsk) ? 1 : 2;
788 sig->notify_count = count;
789 while (atomic_read(&sig->count) > count) {
790 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
791 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
792 schedule(); 796 schedule();
@@ -1387,8 +1391,6 @@ int do_execve(char * filename,
1387 if (retval < 0) 1391 if (retval < 0)
1388 goto out; 1392 goto out;
1389 1393
1390 current->stack_start = current->mm->start_stack;
1391
1392 /* execve succeeded */ 1394 /* execve succeeded */
1393 current->fs->in_exec = 0; 1395 current->fs->in_exec = 0;
1394 current->in_execve = 0; 1396 current->in_execve = 0;
@@ -1659,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1659 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1660 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1661 struct completion *vfork_done; 1663 struct completion *vfork_done;
1662 int core_waiters; 1664 int core_waiters = -EBUSY;
1663 1665
1664 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1665 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1666 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1667 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1668 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1669 1674
1670 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1784,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1784} 1789}
1785 1790
1786 1791
1792/*
 1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or
1798 * PTR_ERR on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
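
Since umh_pipe_setup() above installs the read end of the pipe as fd 0 of the usermode helper, a core_pattern pipe handler reduces to a program that drains stdin. A minimal sketch of such a helper; the destination path, the copy buffer size, and the registration line are illustrative choices, not anything this patch mandates:

/*
 * Sketch of a core_pattern pipe helper.  Registered with, e.g.:
 *   echo '|/usr/local/bin/core-catcher %p' > /proc/sys/kernel/core_pattern
 * The kernel hands us the core image on stdin (fd 0), courtesy of
 * umh_pipe_setup() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[256], buf[65536];
	ssize_t n;
	int fd;

	/* %p in core_pattern becomes the dumping pid (argv[1] here) */
	snprintf(path, sizeof(path), "/var/tmp/core.%s",
		 argc > 1 ? argv[1] : "unknown");

	fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0600);
	if (fd < 0)
		return 1;

	while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0)
		if (write(fd, buf, n) != n)
			break;

	close(fd);
	return 0;
}
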
1787void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1788{ 1837{
1789 struct core_state core_state; 1838 struct core_state core_state;
1790 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1791 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1792 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1793 struct inode * inode;
1794 const struct cred *old_cred; 1842 const struct cred *old_cred;
1795 struct cred *cred; 1843 struct cred *cred;
1796 int retval = 0; 1844 int retval = 0;
1797 int flag = 0; 1845 int flag = 0;
1798 int ispipe = 0; 1846 int ispipe;
1799 char **helper_argv = NULL;
1800 int helper_argc = 0;
1801 int dump_count = 0;
1802 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1803 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1804 .signr = signr, 1849 .signr = signr,
@@ -1817,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1817 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1818 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1819 goto fail; 1864 goto fail;
1820 1865 if (!__get_dumpable(cprm.mm_flags))
1821 cred = prepare_creds();
1822 if (!cred) {
1823 retval = -ENOMEM;
1824 goto fail; 1866 goto fail;
1825 }
1826 1867
1827 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1828 /* 1869 if (!cred)
1829 * If another thread got here first, or we are not dumpable, bail out.
1830 */
1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1832 up_write(&mm->mmap_sem);
1833 put_cred(cred);
1834 goto fail; 1870 goto fail;
1835 }
1836
1837 /* 1871 /*
1838 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1839 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1846,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1846 } 1880 }
1847 1881
1848 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1849 if (retval < 0) { 1883 if (retval < 0)
1850 put_cred(cred); 1884 goto fail_creds;
1851 goto fail;
1852 }
1853 1885
1854 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1855 1887
@@ -1867,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1867 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1868 unlock_kernel(); 1900 unlock_kernel();
1869 1901
1870 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1871 goto fail_unlock;
1872
1873 if (ispipe) { 1902 if (ispipe) {
1874 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1875 /* 1907 /*
1876 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1877 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1878 * cprm.limit of 0 here as a special value. Any 1910 * cprm.limit of 1 here as a special value. Any
1879 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1880 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1881 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1882 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1883 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1884 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1885 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1887,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1887 * core_pattern process dies. 1919 * core_pattern process dies.
1888 */ 1920 */
1889 printk(KERN_WARNING 1921 printk(KERN_WARNING
1890 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1891 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1892 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1893 goto fail_unlock; 1925 goto fail_unlock;
1894 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1895 1928
1896 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1897 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1901,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1901 goto fail_dropcount; 1934 goto fail_dropcount;
1902 } 1935 }
1903 1936
1904 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1905 if (!helper_argv) { 1938 if (!helper_argv) {
1906 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1907 __func__); 1940 __func__);
1908 goto fail_dropcount; 1941 goto fail_dropcount;
1909 } 1942 }
1910 1943
1911 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1912 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1913 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1914 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1915 &cprm.file)) { 1948 if (retval) {
1916 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1917 corename); 1950 corename);
1918 goto fail_dropcount; 1951 goto close_fail;
1919 } 1952 }
1920 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1921 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1922 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1923 0600); 1961 0600);
1924 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1925 goto fail_dropcount; 1963 goto fail_unlock;
1926 inode = cprm.file->f_path.dentry->d_inode;
1927 if (inode->i_nlink > 1)
1928 goto close_fail; /* multiple links - don't dump */
1929 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1930 goto close_fail;
1931
1932 /* AK: actually I see no reason not to allow this for named pipes etc.,
1933 but keep the previous behaviour for now. */
1934 if (!ispipe && !S_ISREG(inode->i_mode))
1935 goto close_fail;
1936 /*
1937 * Don't allow local users to get cute and trick others into
1938 * coredumping into their pre-created files:
1939 * Note, this is not relevant for pipes
1940 */
1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1942 goto close_fail;
1943 if (!cprm.file->f_op)
1944 goto close_fail;
1945 if (!cprm.file->f_op->write)
1946 goto close_fail;
1947 if (!ispipe &&
1948 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1949 goto close_fail;
1950 1964
1951 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually I see no reason not to allow this for named
1972 * pipes etc., but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Don't allow local users to get cute and trick others into
1978 * coredumping into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1952 1987
1988 retval = binfmt->core_dump(&cprm);
1953 if (retval) 1989 if (retval)
1954 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1955close_fail: 1991
1956 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1957 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1958 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1959fail_dropcount: 1997fail_dropcount:
1960 if (dump_count) 1998 if (ispipe)
1961 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1962fail_unlock: 2000fail_unlock:
1963 if (helper_argv) 2001 coredump_finish(mm);
1964 argv_free(helper_argv);
1965
1966 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1967 put_cred(cred); 2004 put_cred(cred);
1968 coredump_finish(mm);
1969fail: 2005fail:
1970 return; 2006 return;
1971} 2007}
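
One consequence of the rework above: umh_pipe_setup() pins the helper's RLIMIT_CORE to {1, 1}, and do_coredump() aborts when it sees cprm.limit == 1, so a crash of the helper itself cannot recurse. A hedged userspace check a helper could perform (the sentinel value 1 comes from this patch; the rest is illustrative):

/* Inside a core_pattern helper: detect the recursion-trap rlimit. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_CORE, &rl) == 0 &&
	    rl.rlim_cur == 1 && rl.rlim_max == 1)
		fprintf(stderr, "running under the kernel coredump path; "
				"our own crashes will not recurse\n");
	return 0;
}
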
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
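
The exofs/dir.c fix above is the classic kmap_atomic pairing bug: kunmap_atomic() wants the kernel virtual address that kmap_atomic() returned, not the struct page that was mapped. A sketch of the correct pairing, assuming kernel context (linux/highmem.h); the helper name is illustrative:

/*
 * Pairing rule restored by the fix: unmap the address, not the page.
 */
static void zero_page_tail(struct page *page, unsigned int from)
{
	char *kaddr = kmap_atomic(page, KM_USER0);

	memset(kaddr + from, 0, PAGE_SIZE - from);
	kunmap_atomic(kaddr, KM_USER0);		/* kaddr, never page */
}
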
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 8442e353309f..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -84,6 +85,7 @@ struct exofs_sb_info {
84 u32 s_next_generation; /* next gen # to use */ 85 u32 s_next_generation; /* next gen # to use */
85 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
87 89
88 struct pnfs_osd_data_map data_map; /* Default raid to use 90 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ? 91 * FIXME: Needed ?
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
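
Both exofs hunks track the tree-wide prototype change in which ->fsync() no longer receives a dentry; implementations recover the inode from file->f_mapping->host instead. A minimal sketch of the new shape, assuming kernel context; example_fsync and the final write_inode_now() call are illustrative, not exofs code:

/* New-style ->fsync(): no dentry argument, inode via the mapping. */
static int example_fsync(struct file *file, int datasync)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	int err;

	err = filemap_write_and_wait(mapping);	/* flush the data pages */
	if (err)
		return err;
	/* datasync-only optimizations elided for brevity */
	return write_inode_now(inode, 1);	/* then the metadata */
}
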
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
@@ -754,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
754 return ret; 755 return ret;
755} 756}
756 757
758static int exofs_releasepage(struct page *page, gfp_t gfp)
759{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1);
762 return try_to_free_buffers(page);
763}
764
765static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
768 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771}
772
757const struct address_space_operations exofs_aops = { 773const struct address_space_operations exofs_aops = {
758 .readpage = exofs_readpage, 774 .readpage = exofs_readpage,
759 .readpages = exofs_readpages, 775 .readpages = exofs_readpages,
@@ -761,6 +777,21 @@ const struct address_space_operations exofs_aops = {
761 .writepages = exofs_writepages, 777 .writepages = exofs_writepages,
762 .write_begin = exofs_write_begin_export, 778 .write_begin = exofs_write_begin_export,
763 .write_end = exofs_write_end, 779 .write_end = exofs_write_end,
780 .releasepage = exofs_releasepage,
781 .set_page_dirty = __set_page_dirty_nobuffers,
782 .invalidatepage = exofs_invalidatepage,
783
784 /* Not implemented yet */
785 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
786 .direct_IO = NULL, /* TODO: Should be trivial to do */
787
788 /* For these, NULL has a special meaning or the default is not exported */
789 .sync_page = NULL,
790 .get_xip_mem = NULL,
791 .migratepage = NULL,
792 .launder_page = NULL,
793 .is_partially_uptodate = NULL,
794 .error_remove_page = NULL,
764}; 795};
765 796
766/****************************************************************************** 797/******************************************************************************
@@ -1122,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1122 sbi = sb->s_fs_info; 1153 sbi = sb->s_fs_info;
1123 1154
1124 sb->s_dirt = 1; 1155 sb->s_dirt = 1;
1125 inode->i_uid = current->cred->fsuid; 1156 inode_init_owner(inode, dir, mode);
1126 if (dir->i_mode & S_ISGID) {
1127 inode->i_gid = dir->i_gid;
1128 if (S_ISDIR(mode))
1129 mode |= S_ISGID;
1130 } else {
1131 inode->i_gid = current->cred->fsgid;
1132 }
1133 inode->i_mode = mode;
1134
1135 inode->i_ino = sbi->s_nextid++; 1157 inode->i_ino = sbi->s_nextid++;
1136 inode->i_blkbits = EXOFS_BLKSHIFT; 1158 inode->i_blkbits = EXOFS_BLKSHIFT;
1137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1159 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */ 23 */
24 24
25#include <linux/slab.h>
25#include <scsi/scsi_device.h> 26#include <scsi/scsi_device.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27 28
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/exportfs.h> 39#include <linux/exportfs.h>
40#include <linux/slab.h>
40 41
41#include "exofs.h" 42#include "exofs.h"
42 43
@@ -301,6 +302,7 @@ static void exofs_put_super(struct super_block *sb)
301 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
302 sbi->layout.s_pid); 303 sbi->layout.s_pid);
303 304
305 bdi_destroy(&sbi->bdi);
304 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
305 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
306} 308}
@@ -545,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
545 if (!sbi) 547 if (!sbi)
546 return -ENOMEM; 548 return -ENOMEM;
547 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
548 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
549 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
550 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -611,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
611 } 617 }
612 618
613 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
614 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
615 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
616 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -642,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
642 return 0; 649 return 0;
643 650
644free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
645 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
646 opts->dev_name, sbi->layout.s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
647 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
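
The exofs superblock changes follow the standard shape for a filesystem that owns its backing_dev_info: register a per-sb bdi during fill_super, point sb->s_bdi at it, and destroy it on both the error path and put_super. A condensed sketch of that pattern; the example_* names and error labels are illustrative, and the filesystem-specific steps are elided:

struct example_sb_info {
	struct backing_dev_info bdi;
	/* ... filesystem-private state ... */
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	struct example_sb_info *sbi;
	int ret;

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	ret = bdi_setup_and_register(&sbi->bdi, "example", BDI_CAP_MAP_COPY);
	if (ret)
		goto free_sbi;

	sb->s_bdi = &sbi->bdi;	/* VFS writeback now targets our bdi */
	sb->s_fs_info = sbi;
	/* ... device lookup, root inode, etc. ... */
	return 0;

free_sbi:
	kfree(sbi);
	return ret;
}

static void example_put_super(struct super_block *sb)
{
	struct example_sb_info *sbi = sb->s_fs_info;

	bdi_destroy(&sbi->bdi);	/* pairs with bdi_setup_and_register() */
	kfree(sbi);
	sb->s_fs_info = NULL;
}
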
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
420 return error; 420 return error;
421} 421}
422 422
423struct xattr_handler ext2_xattr_acl_access_handler = { 423const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 425 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 428 .set = ext2_xattr_set_acl,
429}; 429};
430 430
431struct xattr_handler ext2_xattr_acl_default_handler = { 431const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
@@ -1331,6 +1332,12 @@ retry_alloc:
1331 1332
1332 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1333 /* 1334 /*
1335 * skip this group (and avoid loading bitmap) if there
1336 * are no free blocks
1337 */
1338 if (!free_blocks)
1339 continue;
1340 /*
1334 * skip this group if the number of 1341 * skip this group if the number of
1335 * free blocks is less than half of the reservation 1342 * free blocks is less than half of the reservation
1336 * window size. 1343 * window size.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
106 struct super_block * sb = inode->i_sb; 106 struct super_block * sb = inode->i_sb;
107 int is_directory; 107 int is_directory;
108 unsigned long ino; 108 unsigned long ino;
109 struct buffer_head *bitmap_bh = NULL; 109 struct buffer_head *bitmap_bh;
110 unsigned long block_group; 110 unsigned long block_group;
111 unsigned long bit; 111 unsigned long bit;
112 struct ext2_super_block * es; 112 struct ext2_super_block * es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
135 ino > le32_to_cpu(es->s_inodes_count)) { 135 ino > le32_to_cpu(es->s_inodes_count)) {
136 ext2_error (sb, "ext2_free_inode", 136 ext2_error (sb, "ext2_free_inode",
137 "reserved or nonexistent inode %lu", ino); 137 "reserved or nonexistent inode %lu", ino);
138 goto error_return; 138 return;
139 } 139 }
140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); 140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); 141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
142 brelse(bitmap_bh);
143 bitmap_bh = read_inode_bitmap(sb, block_group); 142 bitmap_bh = read_inode_bitmap(sb, block_group);
144 if (!bitmap_bh) 143 if (!bitmap_bh)
145 goto error_return; 144 return;
146 145
147 /* Ok, now we can actually update the inode bitmaps.. */ 146 /* Ok, now we can actually update the inode bitmaps.. */
148 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), 147 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
154 mark_buffer_dirty(bitmap_bh); 153 mark_buffer_dirty(bitmap_bh);
155 if (sb->s_flags & MS_SYNCHRONOUS) 154 if (sb->s_flags & MS_SYNCHRONOUS)
156 sync_dirty_buffer(bitmap_bh); 155 sync_dirty_buffer(bitmap_bh);
157error_return: 156
158 brelse(bitmap_bh); 157 brelse(bitmap_bh);
159} 158}
160 159
@@ -550,16 +549,12 @@ got:
550 549
551 sb->s_dirt = 1; 550 sb->s_dirt = 1;
552 mark_buffer_dirty(bh2); 551 mark_buffer_dirty(bh2);
553 inode->i_uid = current_fsuid(); 552 if (test_opt(sb, GRPID)) {
554 if (test_opt (sb, GRPID)) 553 inode->i_mode = mode;
555 inode->i_gid = dir->i_gid; 554 inode->i_uid = current_fsuid();
556 else if (dir->i_mode & S_ISGID) {
557 inode->i_gid = dir->i_gid; 555 inode->i_gid = dir->i_gid;
558 if (S_ISDIR(mode))
559 mode |= S_ISGID;
560 } else 556 } else
561 inode->i_gid = current_fsgid(); 557 inode_init_owner(inode, dir, mode);
562 inode->i_mode = mode;
563 558
564 inode->i_ino = ino; 559 inode->i_ino = ino;
565 inode->i_blocks = 0; 560 inode->i_blocks = 0;
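
The exofs and ext2 hunks above replace the same open-coded ownership block with inode_init_owner(). Reconstructed from the lines being removed, the logic the helper centralizes is roughly the following sketch (not the helper's actual source):

static void sketch_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new dirs inherit setgid too */
	} else {
		inode->i_gid = current_fsgid();
	}
	inode->i_mode = mode;
}

Note that the ext2 GRPID mount option keeps its own branch in the caller, as the ialloc.c hunk shows; only the common default policy moves into the helper.
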
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..19214435b752 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25#include <linux/smp_lock.h>
26#include <linux/time.h> 25#include <linux/time.h>
27#include <linux/highuid.h> 26#include <linux/highuid.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
@@ -55,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
55 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
56} 55}
57 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
58/* 69/*
59 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
60 */ 71 */
@@ -72,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
72 83
73 inode->i_size = 0; 84 inode->i_size = 0;
74 if (inode->i_blocks) 85 if (inode->i_blocks)
75 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
76 ext2_free_inode (inode); 87 ext2_free_inode (inode);
77 88
78 return; 89 return;
@@ -758,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
758 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
759 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
760{ 771{
761 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
762 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
763} 774}
764 775
765static int 776static int
@@ -767,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
767 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
768 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
769{ 780{
781 int ret;
782
770 *pagep = NULL; 783 *pagep = NULL;
771 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
772} 800}
773 801
774static int 802static int
@@ -776,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
776 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
777 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
778{ 806{
807 int ret;
808
779 /* 809 /*
780 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
781 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
782 * pages in order to make this work easily. 812 * pages in order to make this work easily.
783 */ 813 */
784 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
785 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
786} 819}
787 820
788static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -801,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
801 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
802{ 835{
803 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
804 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
805 838 struct inode *inode = mapping->host;
806 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
807 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
808} 846}
809 847
810static int 848static int
@@ -819,7 +857,7 @@ const struct address_space_operations ext2_aops = {
819 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
820 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
821 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
822 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
823 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
824 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
825 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1028,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1028 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1029} 1067}
1030 1068
1031void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1032{ 1070{
1033 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1034 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1040,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1040 int n; 1078 int n;
1041 long iblock; 1079 long iblock;
1042 unsigned blocksize; 1080 unsigned blocksize;
1043
1044 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1045 S_ISLNK(inode->i_mode)))
1046 return;
1047 if (ext2_inode_is_fast_symlink(inode))
1048 return;
1049 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1050 return;
1051
1052 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1053 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1055
1056 if (mapping_is_xip(inode->i_mapping))
1057 xip_truncate_page(inode->i_mapping, inode->i_size);
1058 else if (test_opt(inode->i_sb, NOBH))
1059 nobh_truncate_page(inode->i_mapping,
1060 inode->i_size, ext2_get_block);
1061 else
1062 block_truncate_page(inode->i_mapping,
1063 inode->i_size, ext2_get_block);
1064 1083
1065 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1066 if (n == 0) 1085 if (n == 0)
@@ -1128,6 +1147,62 @@ do_indirects:
1128 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1129 1148
1130 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
1155 * XXX: it seems like a bug here that we don't allow
1156 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1157 * review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1131 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1132 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1133 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1135,6 +1210,8 @@ do_indirects:
1135 } else { 1210 } else {
1136 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1137 } 1212 }
1213
1214 return 0;
1138} 1215}
1139 1216
1140static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1406,11 +1483,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
1406 /* If this is the first large file 1483 /* If this is the first large file
1407 * created, add a flag to the superblock. 1484 * created, add a flag to the superblock.
1408 */ 1485 */
1409 lock_kernel(); 1486 spin_lock(&EXT2_SB(sb)->s_lock);
1410 ext2_update_dynamic_rev(sb); 1487 ext2_update_dynamic_rev(sb);
1411 EXT2_SET_RO_COMPAT_FEATURE(sb, 1488 EXT2_SET_RO_COMPAT_FEATURE(sb,
1412 EXT2_FEATURE_RO_COMPAT_LARGE_FILE); 1489 EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
1413 unlock_kernel(); 1490 spin_unlock(&EXT2_SB(sb)->s_lock);
1414 ext2_write_super(sb); 1491 ext2_write_super(sb);
1415 } 1492 }
1416 } 1493 }
@@ -1467,7 +1544,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1467 if (error) 1544 if (error)
1468 return error; 1545 return error;
1469 1546
1470 if (iattr->ia_valid & ATTR_SIZE) 1547 if (is_quota_modification(inode, iattr))
1471 dquot_initialize(inode); 1548 dquot_initialize(inode);
1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1549 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1550 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -1475,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1475 if (error) 1552 if (error)
1476 return error; 1553 return error;
1477 } 1554 }
1478 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE) {
1479 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1480 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1481 return error; 1565 return error;
1482} 1566}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h> 28#include <linux/exportfs.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/seq_file.h> 30#include <linux/seq_file.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
@@ -39,7 +38,7 @@
39#include "xip.h" 38#include "xip.h"
40 39
41static void ext2_sync_super(struct super_block *sb, 40static void ext2_sync_super(struct super_block *sb,
42 struct ext2_super_block *es); 41 struct ext2_super_block *es, int wait);
43static int ext2_remount (struct super_block * sb, int * flags, char * data); 42static int ext2_remount (struct super_block * sb, int * flags, char * data);
44static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
45static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
52 struct ext2_super_block *es = sbi->s_es; 51 struct ext2_super_block *es = sbi->s_es;
53 52
54 if (!(sb->s_flags & MS_RDONLY)) { 53 if (!(sb->s_flags & MS_RDONLY)) {
54 spin_lock(&sbi->s_lock);
55 sbi->s_mount_state |= EXT2_ERROR_FS; 55 sbi->s_mount_state |= EXT2_ERROR_FS;
56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS); 56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
57 ext2_sync_super(sb, es); 57 spin_unlock(&sbi->s_lock);
58 ext2_sync_super(sb, es, 1);
58 } 59 }
59 60
60 va_start(args, fmt); 61 va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
84 va_end(args); 85 va_end(args);
85} 86}
86 87
88/*
89 * This must be called with sbi->s_lock held.
90 */
87void ext2_update_dynamic_rev(struct super_block *sb) 91void ext2_update_dynamic_rev(struct super_block *sb)
88{ 92{
89 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 93 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,7 +119,7 @@ static void ext2_put_super (struct super_block * sb)
115 int i; 119 int i;
116 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
117 121
118 lock_kernel(); 122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
119 123
120 if (sb->s_dirt) 124 if (sb->s_dirt)
121 ext2_write_super(sb); 125 ext2_write_super(sb);
@@ -124,8 +128,10 @@ static void ext2_put_super (struct super_block * sb)
124 if (!(sb->s_flags & MS_RDONLY)) { 128 if (!(sb->s_flags & MS_RDONLY)) {
125 struct ext2_super_block *es = sbi->s_es; 129 struct ext2_super_block *es = sbi->s_es;
126 130
131 spin_lock(&sbi->s_lock);
127 es->s_state = cpu_to_le16(sbi->s_mount_state); 132 es->s_state = cpu_to_le16(sbi->s_mount_state);
128 ext2_sync_super(sb, es); 133 spin_unlock(&sbi->s_lock);
134 ext2_sync_super(sb, es, 1);
129 } 135 }
130 db_count = sbi->s_gdb_count; 136 db_count = sbi->s_gdb_count;
131 for (i = 0; i < db_count; i++) 137 for (i = 0; i < db_count; i++)
@@ -140,8 +146,6 @@ static void ext2_put_super (struct super_block * sb)
140 sb->s_fs_info = NULL; 146 sb->s_fs_info = NULL;
141 kfree(sbi->s_blockgroup_lock); 147 kfree(sbi->s_blockgroup_lock);
142 kfree(sbi); 148 kfree(sbi);
143
144 unlock_kernel();
145} 149}
146 150
147static struct kmem_cache * ext2_inode_cachep; 151static struct kmem_cache * ext2_inode_cachep;
@@ -209,6 +213,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
209 struct ext2_super_block *es = sbi->s_es; 213 struct ext2_super_block *es = sbi->s_es;
210 unsigned long def_mount_opts; 214 unsigned long def_mount_opts;
211 215
216 spin_lock(&sbi->s_lock);
212 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 217 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
213 218
214 if (sbi->s_sb_block != 1) 219 if (sbi->s_sb_block != 1)
@@ -281,6 +286,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
281 if (!test_opt(sb, RESERVATION)) 286 if (!test_opt(sb, RESERVATION))
282 seq_puts(seq, ",noreservation"); 287 seq_puts(seq, ",noreservation");
283 288
289 spin_unlock(&sbi->s_lock);
284 return 0; 290 return 0;
285} 291}
286 292
@@ -606,7 +612,6 @@ static int ext2_setup_super (struct super_block * sb,
606 if (!le16_to_cpu(es->s_max_mnt_count)) 612 if (!le16_to_cpu(es->s_max_mnt_count))
607 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); 613 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
608 le16_add_cpu(&es->s_mnt_count, 1); 614 le16_add_cpu(&es->s_mnt_count, 1);
609 ext2_write_super(sb);
610 if (test_opt (sb, DEBUG)) 615 if (test_opt (sb, DEBUG))
611 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " 616 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
612 "bpg=%lu, ipg=%lu, mo=%04lx]", 617 "bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +772,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
767 sb->s_fs_info = sbi; 772 sb->s_fs_info = sbi;
768 sbi->s_sb_block = sb_block; 773 sbi->s_sb_block = sb_block;
769 774
775 spin_lock_init(&sbi->s_lock);
776
770 /* 777 /*
771 * See what the current blocksize for the device is, and 778 * See what the current blocksize for the device is, and
772 * use that as the blocksize. Otherwise (or if the blocksize 779 * use that as the blocksize. Otherwise (or if the blocksize
@@ -1058,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1058 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1059 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1060 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1061 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1062 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1063 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1079,7 +1092,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1079 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 1092 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1080 ext2_msg(sb, KERN_WARNING, 1093 ext2_msg(sb, KERN_WARNING,
1081 "warning: mounting ext3 filesystem as ext2"); 1094 "warning: mounting ext3 filesystem as ext2");
1082 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1095 if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
1096 sb->s_flags |= MS_RDONLY;
1097 ext2_write_super(sb);
1083 return 0; 1098 return 0;
1084 1099
1085cantfind_ext2: 1100cantfind_ext2:
@@ -1120,30 +1135,26 @@ static void ext2_clear_super_error(struct super_block *sb)
1120 * be remapped. Nothing we can do but to retry the 1135 * be remapped. Nothing we can do but to retry the
1121 * write and hope for the best. 1136 * write and hope for the best.
1122 */ 1137 */
1123 printk(KERN_ERR "EXT2-fs: %s previous I/O error to " 1138 ext2_msg(sb, KERN_ERR,
1124 "superblock detected", sb->s_id); 1139 "previous I/O error to superblock detected\n");
1125 clear_buffer_write_io_error(sbh); 1140 clear_buffer_write_io_error(sbh);
1126 set_buffer_uptodate(sbh); 1141 set_buffer_uptodate(sbh);
1127 } 1142 }
1128} 1143}
1129 1144
1130static void ext2_commit_super (struct super_block * sb, 1145static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
1131 struct ext2_super_block * es) 1146 int wait)
1132{
1133 ext2_clear_super_error(sb);
1134 es->s_wtime = cpu_to_le32(get_seconds());
1135 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1136 sb->s_dirt = 0;
1137}
1138
1139static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1140{ 1147{
1141 ext2_clear_super_error(sb); 1148 ext2_clear_super_error(sb);
1149 spin_lock(&EXT2_SB(sb)->s_lock);
1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1150 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1151 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
1144 es->s_wtime = cpu_to_le32(get_seconds()); 1152 es->s_wtime = cpu_to_le32(get_seconds());
1153 /* unlock before we do IO */
1154 spin_unlock(&EXT2_SB(sb)->s_lock);
1145 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 1155 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1146 sync_dirty_buffer(EXT2_SB(sb)->s_sbh); 1156 if (wait)
1157 sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
1147 sb->s_dirt = 0; 1158 sb->s_dirt = 0;
1148} 1159}
1149 1160
@@ -1157,43 +1168,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1157 * may have been checked while mounted and e2fsck may have 1168 * may have been checked while mounted and e2fsck may have
1158 * set s_state to EXT2_VALID_FS after some corrections. 1169 * set s_state to EXT2_VALID_FS after some corrections.
1159 */ 1170 */
1160
1161static int ext2_sync_fs(struct super_block *sb, int wait) 1171static int ext2_sync_fs(struct super_block *sb, int wait)
1162{ 1172{
1173 struct ext2_sb_info *sbi = EXT2_SB(sb);
1163 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 1174 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1164 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1165
1166 lock_kernel();
1167 if (buffer_write_io_error(sbh)) {
1168 /*
1169 * Oh, dear. A previous attempt to write the
1170 * superblock failed. This could happen because the
1171 * USB device was yanked out. Or it could happen to
1172 * be a transient write error and maybe the block will
1173 * be remapped. Nothing we can do but to retry the
1174 * write and hope for the best.
1175 */
1176 ext2_msg(sb, KERN_ERR,
1177 "previous I/O error to superblock detected\n");
1178 clear_buffer_write_io_error(sbh);
1179 set_buffer_uptodate(sbh);
1180 }
1181 1175
1176 spin_lock(&sbi->s_lock);
1182 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1177 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1183 ext2_debug("setting valid to 0\n"); 1178 ext2_debug("setting valid to 0\n");
1184 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1179 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
1185 es->s_free_blocks_count =
1186 cpu_to_le32(ext2_count_free_blocks(sb));
1187 es->s_free_inodes_count =
1188 cpu_to_le32(ext2_count_free_inodes(sb));
1189 es->s_mtime = cpu_to_le32(get_seconds());
1190 ext2_sync_super(sb, es);
1191 } else {
1192 ext2_commit_super(sb, es);
1193 } 1180 }
1194 sb->s_dirt = 0; 1181 spin_unlock(&sbi->s_lock);
1195 unlock_kernel(); 1182 ext2_sync_super(sb, es, wait);
1196
1197 return 0; 1183 return 0;
1198} 1184}
1199 1185
@@ -1215,7 +1201,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1215 unsigned long old_sb_flags; 1201 unsigned long old_sb_flags;
1216 int err; 1202 int err;
1217 1203
1218 lock_kernel(); 1204 spin_lock(&sbi->s_lock);
1219 1205
1220 /* Store the old options */ 1206 /* Store the old options */
1221 old_sb_flags = sb->s_flags; 1207 old_sb_flags = sb->s_flags;
@@ -1254,21 +1240,31 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1240 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1255 } 1241 }
1256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1242 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1257 unlock_kernel(); 1243 spin_unlock(&sbi->s_lock);
1258 return 0; 1244 return 0;
1259 } 1245 }
1260 if (*flags & MS_RDONLY) { 1246 if (*flags & MS_RDONLY) {
1261 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1247 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1262 !(sbi->s_mount_state & EXT2_VALID_FS)) { 1248 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1263 unlock_kernel(); 1249 spin_unlock(&sbi->s_lock);
1264 return 0; 1250 return 0;
1265 } 1251 }
1252
1266 /* 1253 /*
1267 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1268 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
1269 */ 1256 */
1270 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1271 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1267 ext2_sync_super(sb, es, 1);
1272 } else { 1268 } else {
1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1274 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1270 ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1284,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1288 sbi->s_mount_state = le16_to_cpu(es->s_state); 1284 sbi->s_mount_state = le16_to_cpu(es->s_state);
1289 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1290 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1287 spin_unlock(&sbi->s_lock);
1288
1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1291 } 1292 }
1292 ext2_sync_super(sb, es); 1293
1293 unlock_kernel();
1294 return 0; 1294 return 0;
1295restore_opts: 1295restore_opts:
1296 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
1297 sbi->s_resuid = old_opts.s_resuid; 1297 sbi->s_resuid = old_opts.s_resuid;
1298 sbi->s_resgid = old_opts.s_resgid; 1298 sbi->s_resgid = old_opts.s_resgid;
1299 sb->s_flags = old_sb_flags; 1299 sb->s_flags = old_sb_flags;
1300 unlock_kernel(); 1300 spin_unlock(&sbi->s_lock);
1301 return err; 1301 return err;
1302} 1302}
1303 1303
@@ -1308,6 +1308,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1308 struct ext2_super_block *es = sbi->s_es; 1308 struct ext2_super_block *es = sbi->s_es;
1309 u64 fsid; 1309 u64 fsid;
1310 1310
1311 spin_lock(&sbi->s_lock);
1312
1311 if (test_opt (sb, MINIX_DF)) 1313 if (test_opt (sb, MINIX_DF))
1312 sbi->s_overhead_last = 0; 1314 sbi->s_overhead_last = 0;
1313 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { 1315 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1364,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1362 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1364 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
1363 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1365 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1364 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1366 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1367 spin_unlock(&sbi->s_lock);
1365 return 0; 1368 return 0;
1366} 1369}
1367 1370
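
The super.c conversion swaps the BKL for sbi->s_lock, a spinlock guarding the in-memory superblock fields, with one invariant repeated throughout: update es under the lock, then drop it before any buffer IO, since sync_dirty_buffer() sleeps. Condensed from ext2_sync_super() above as a reminder of the shape (kernel context assumed; example_sync_super is an illustrative name):

static void example_sync_super(struct super_block *sb,
			       struct ext2_super_block *es, int wait)
{
	spin_lock(&EXT2_SB(sb)->s_lock);
	es->s_wtime = cpu_to_le32(get_seconds());
	/* ... other in-memory superblock updates ... */
	spin_unlock(&EXT2_SB(sb)->s_lock);	/* never do IO under s_lock */

	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
	if (wait)
		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);	/* may sleep */
	sb->s_dirt = 0;
}
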
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
32 .readlink = generic_readlink, 32 .readlink = generic_readlink,
33 .follow_link = page_follow_link_light, 33 .follow_link = page_follow_link_light,
34 .put_link = page_put_link, 34 .put_link = page_put_link,
35 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 36#ifdef CONFIG_EXT2_FS_XATTR
36 .setxattr = generic_setxattr, 37 .setxattr = generic_setxattr,
37 .getxattr = generic_getxattr, 38 .getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
43const struct inode_operations ext2_fast_symlink_inode_operations = { 44const struct inode_operations ext2_fast_symlink_inode_operations = {
44 .readlink = generic_readlink, 45 .readlink = generic_readlink,
45 .follow_link = ext2_follow_link, 46 .follow_link = ext2_follow_link,
47 .setattr = ext2_setattr,
46#ifdef CONFIG_EXT2_FS_XATTR 48#ifdef CONFIG_EXT2_FS_XATTR
47 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
48 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
101 101
102static struct mb_cache *ext2_xattr_cache; 102static struct mb_cache *ext2_xattr_cache;
103 103
104static struct xattr_handler *ext2_xattr_handler_map[] = { 104static const struct xattr_handler *ext2_xattr_handler_map[] = {
105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
106#ifdef CONFIG_EXT2_FS_POSIX_ACL 106#ifdef CONFIG_EXT2_FS_POSIX_ACL
107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
113#endif 113#endif
114}; 114};
115 115
116struct xattr_handler *ext2_xattr_handlers[] = { 116const struct xattr_handler *ext2_xattr_handlers[] = {
117 &ext2_xattr_user_handler, 117 &ext2_xattr_user_handler,
118 &ext2_xattr_trusted_handler, 118 &ext2_xattr_trusted_handler,
119#ifdef CONFIG_EXT2_FS_POSIX_ACL 119#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
126 NULL 126 NULL
127}; 127};
128 128
129static inline struct xattr_handler * 129static inline const struct xattr_handler *
130ext2_xattr_handler(int name_index) 130ext2_xattr_handler(int name_index)
131{ 131{
132 struct xattr_handler *handler = NULL; 132 const struct xattr_handler *handler = NULL;
133 133
134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) 134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
135 handler = ext2_xattr_handler_map[name_index]; 135 handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
298 /* list the attribute names */ 298 /* list the attribute names */
299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); 299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
300 entry = EXT2_XATTR_NEXT(entry)) { 300 entry = EXT2_XATTR_NEXT(entry)) {
301 struct xattr_handler *handler = 301 const struct xattr_handler *handler =
302 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
303 303
304 if (handler) { 304 if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) 345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
346 return; 346 return;
347 347
348 spin_lock(&EXT2_SB(sb)->s_lock);
348 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); 349 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
350 spin_unlock(&EXT2_SB(sb)->s_lock);
349 sb->s_dirt = 1; 351 sb->s_dirt = 1;
350 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 352 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
351} 353}
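The theme of this file (and the matching ext3/ext4 hunks further down) is constifying the xattr handler tables so both the array and the handler objects can live in read-only data; the last hunk additionally serializes the compat-feature update under s_lock. A self-contained sketch of the constified table-dispatch pattern, with made-up handlers and prefixes:

#include <stdio.h>
#include <string.h>

/* Minimal model of a const xattr handler table: handlers are
 * selected by name prefix and never modified at run time. */
struct xattr_handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int user_get(const char *name)  { printf("user: %s\n", name);    return 0; }
static int trust_get(const char *name) { printf("trusted: %s\n", name); return 0; }

static const struct xattr_handler user_handler    = { "user.",    user_get  };
static const struct xattr_handler trusted_handler = { "trusted.", trust_get };

/* NULL-terminated, mirroring the kernel table layout */
static const struct xattr_handler *handlers[] = {
	&user_handler,
	&trusted_handler,
	NULL,
};

static const struct xattr_handler *find_handler(const char *name)
{
	for (const struct xattr_handler **h = handlers; *h; h++)
		if (!strncmp(name, (*h)->prefix, strlen((*h)->prefix)))
			return *h;
	return NULL;
}

int main(void)
{
	const struct xattr_handler *h = find_handler("user.comment");

	return h ? h->get("user.comment") : 1;
}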
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
55 55
56# ifdef CONFIG_EXT2_FS_XATTR 56# ifdef CONFIG_EXT2_FS_XATTR
57 57
58extern struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern struct xattr_handler ext2_xattr_acl_access_handler; 60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern struct xattr_handler ext2_xattr_acl_default_handler; 61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern struct xattr_handler ext2_xattr_security_handler; 62extern const struct xattr_handler ext2_xattr_security_handler;
63 63
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
65 65
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
72extern int init_ext2_xattr(void); 72extern int init_ext2_xattr(void);
73extern void exit_ext2_xattr(void); 73extern void exit_ext2_xattr(void);
74 74
75extern struct xattr_handler *ext2_xattr_handlers[]; 75extern const struct xattr_handler *ext2_xattr_handlers[];
76 76
77# else /* CONFIG_EXT2_FS_XATTR */ 77# else /* CONFIG_EXT2_FS_XATTR */
78 78
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
@@ -66,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
66 return err; 67 return err;
67} 68}
68 69
69struct xattr_handler ext2_xattr_security_handler = { 70const struct xattr_handler ext2_xattr_security_handler = {
70 .prefix = XATTR_SECURITY_PREFIX, 71 .prefix = XATTR_SECURITY_PREFIX,
71 .list = ext2_xattr_security_list, 72 .list = ext2_xattr_security_list,
72 .get = ext2_xattr_security_get, 73 .get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
53struct xattr_handler ext2_xattr_trusted_handler = { 53const struct xattr_handler ext2_xattr_trusted_handler = {
54 .prefix = XATTR_TRUSTED_PREFIX, 54 .prefix = XATTR_TRUSTED_PREFIX,
55 .list = ext2_xattr_trusted_list, 55 .list = ext2_xattr_trusted_list,
56 .get = ext2_xattr_trusted_get, 56 .get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext2_xattr_user_handler = { 57const struct xattr_handler ext2_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext2_xattr_user_list, 59 .list = ext2_xattr_user_list,
60 .get = ext2_xattr_user_get, 60 .get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
456 return error; 456 return error;
457} 457}
458 458
459struct xattr_handler ext3_xattr_acl_access_handler = { 459const struct xattr_handler ext3_xattr_acl_access_handler = {
460 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS, 461 .flags = ACL_TYPE_ACCESS,
462 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
464 .set = ext3_xattr_set_acl, 464 .set = ext3_xattr_set_acl,
465}; 465};
466 466
467struct xattr_handler ext3_xattr_acl_default_handler = { 467const struct xattr_handler ext3_xattr_acl_default_handler = {
468 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT, 469 .flags = ACL_TYPE_DEFAULT,
470 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
@@ -1583,6 +1584,12 @@ retry_alloc:
1583 goto io_error; 1584 goto io_error;
1584 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1585 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1585 /* 1586 /*
1587 * skip this group (and avoid loading bitmap) if there
1588 * are no free blocks
1589 */
1590 if (!free_blocks)
1591 continue;
1592 /*
1586 * skip this group if the number of 1593 * skip this group if the number of
1587 * free blocks is less than half of the reservation 1594 * free blocks is less than half of the reservation
1588 * window size. 1595 * window size.
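The added check consults the cheap per-group free-block counter before paying for bitmap I/O at all. Roughly, in a toy allocator (all names invented for illustration):

#include <stdio.h>

/* Toy allocator scan modeled on the hunk above: look at the per-group
 * free counter first, and only "load the bitmap" (the expensive step)
 * when the group can possibly satisfy the request. */
struct group {
	int free_blocks;
};

static int load_bitmap_and_alloc(int group_no)
{
	printf("scanning bitmap of group %d\n", group_no);
	return group_no;	/* pretend success */
}

static int find_block(struct group *groups, int ngroups)
{
	for (int i = 0; i < ngroups; i++) {
		if (!groups[i].free_blocks)
			continue;	/* skip: avoids bitmap I/O */
		return load_bitmap_and_alloc(i);
	}
	return -1;
}

int main(void)
{
	struct group g[] = { {0}, {0}, {12} };

	printf("allocated from group %d\n", find_block(g, 3));
	return 0;
}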
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
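The one-line fix replaces a by-hand "root->rb_node = NULL" with a whole-struct assignment from RB_ROOT, so the reset stays correct even if struct rb_root ever grows more members and matches how the root is initialized everywhere else. Sketch:

#include <stdio.h>
#include <stddef.h>

/* Simplified rb_root shapes; the kernel's RB_ROOT is likewise a
 * compound-literal initializer for the whole struct. */
struct rb_node { struct rb_node *left, *right; };
struct rb_root { struct rb_node *rb_node; };
#define RB_ROOT ((struct rb_root) { NULL, })

int main(void)
{
	struct rb_node node = { NULL, NULL };
	struct rb_root root = { &node };

	root = RB_ROOT;		/* was: root.rb_node = NULL; */
	printf("empty: %d\n", root.rb_node == NULL);
	return 0;
}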
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,12 +43,12 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret = 0; 51 int ret, needs_barrier = 0;
52 tid_t commit_tid; 52 tid_t commit_tid;
53 53
54 if (inode->i_sb->s_flags & MS_RDONLY) 54 if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
70 * (they were dirtied by commit). But that's OK - the blocks are 70 * (they were dirtied by commit). But that's OK - the blocks are
71 * safe in-journal, which is all fsync() needs to ensure. 71 * safe in-journal, which is all fsync() needs to ensure.
72 */ 72 */
73 if (ext3_should_journal_data(inode)) { 73 if (ext3_should_journal_data(inode))
74 ret = ext3_force_commit(inode->i_sb); 74 return ext3_force_commit(inode->i_sb);
75 goto out;
76 }
77 75
78 if (datasync) 76 if (datasync)
79 commit_tid = atomic_read(&ei->i_datasync_tid); 77 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else 78 else
81 commit_tid = atomic_read(&ei->i_sync_tid); 79 commit_tid = atomic_read(&ei->i_sync_tid);
82 80
83 if (log_start_commit(journal, commit_tid)) { 81 if (test_opt(inode->i_sb, BARRIER) &&
84 log_wait_commit(journal, commit_tid); 82 !journal_trans_will_send_data_barrier(journal, commit_tid))
85 goto out; 83 needs_barrier = 1;
86 } 84 log_start_commit(journal, commit_tid);
85 ret = log_wait_commit(journal, commit_tid);
87 86
88 /* 87 /*
89 * In case we didn't commit a transaction, we have to flush 88 * In case we didn't commit a transaction, we have to flush
90 * disk caches manually so that data really is on persistent 89 * disk caches manually so that data really is on persistent
91 * storage 90 * storage
92 */ 91 */
93 if (test_opt(inode->i_sb, BARRIER)) 92 if (needs_barrier)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95out: 94 BLKDEV_IFL_WAIT);
96 return ret; 95 return ret;
97} 96}
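The rewritten ext3_sync_file decides up front whether the journal commit it waits on will already carry a write barrier, and only falls back to an explicit block-device flush when it will not. A compilable model of that control flow — the helpers below are stand-ins, not the jbd API:

#include <stdbool.h>
#include <stdio.h>

static bool commit_will_send_barrier(int tid) { return tid % 2 == 0; }
static void start_commit(int tid)  { printf("commit %d started\n", tid); }
static void wait_commit(int tid)   { printf("commit %d done\n", tid); }
static void issue_flush(void)      { printf("explicit disk flush\n"); }

/* Mirror of the hunk above: compute needs_barrier before kicking off
 * the commit, then flush manually only if the commit did not. */
static void toy_fsync(int commit_tid, bool barriers_enabled)
{
	bool needs_flush = barriers_enabled &&
			   !commit_will_send_barrier(commit_tid);

	start_commit(commit_tid);
	wait_commit(commit_tid);
	if (needs_flush)
		issue_flush();
}

int main(void)
{
	toy_fsync(7, true);	/* commit lacks barrier -> flush */
	toy_fsync(8, true);	/* barrier piggybacks on commit */
	return 0;
}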
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
538 if (S_ISDIR(mode)) 538 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 540
541 inode->i_uid = current_fsuid(); 541
542 if (test_opt (sb, GRPID)) 542 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 543 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 544 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 545 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 546 } else
549 inode->i_gid = current_fsgid(); 547 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 548
552 inode->i_ino = ino; 549 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 550 /* This is the optimal IO size (for stat), not the fs block size */
@@ -582,7 +579,9 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 579 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 580 spin_unlock(&sbi->s_next_gen_lock);
584 581
585 ei->i_state = EXT3_STATE_NEW; 582 ei->i_state_flags = 0;
583 ext3_set_inode_state(inode, EXT3_STATE_NEW);
584
586 ei->i_extra_isize = 585 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 586 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 587 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
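The open-coded uid/gid/setgid logic collapses into inode_init_owner(), leaving only the GRPID mount-option special case in the filesystem. What inode_init_owner centralizes, rendered as standalone C (uid 1000 stands in for current_fsuid/current_fsgid):

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

/* Pick the new inode's gid from the parent when the parent directory
 * is setgid, and propagate S_ISGID onto new directories. */
static void init_owner(struct toy_inode *inode,
		       const struct toy_inode *dir, mode_t mode)
{
	inode->uid = 1000;		/* current_fsuid() stand-in */
	if (dir->mode & S_ISGID) {
		inode->gid = dir->gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else {
		inode->gid = 1000;	/* current_fsgid() stand-in */
	}
	inode->mode = mode;
}

int main(void)
{
	struct toy_inode dir = { 0, 50, S_IFDIR | S_ISGID | 0775 };
	struct toy_inode child;

	init_owner(&child, &dir, S_IFDIR | 0755);
	printf("child gid=%u setgid=%d\n", child.gid,
	       !!(child.mode & S_ISGID));
	return 0;
}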
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2813 2813
2814 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2815 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2817 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3151 if (error) 3151 if (error)
3152 return error; 3152 return error;
3153 3153
3154 if (ia_valid & ATTR_SIZE) 3154 if (is_quota_modification(inode, attr))
3155 dquot_initialize(inode); 3155 dquot_initialize(inode);
3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -653,8 +655,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
653 seq_printf(seq, ",commit=%u", 655 seq_printf(seq, ",commit=%u",
654 (unsigned) (sbi->s_commit_interval / HZ)); 656 (unsigned) (sbi->s_commit_interval / HZ));
655 } 657 }
656 if (test_opt(sb, BARRIER)) 658
657 seq_puts(seq, ",barrier=1"); 659 /*
660 * Always display barrier state so it's clear what the status is.
661 */
662 seq_puts(seq, ",barrier=");
663 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
658 if (test_opt(sb, NOBH)) 664 if (test_opt(sb, NOBH))
659 seq_puts(seq, ",nobh"); 665 seq_puts(seq, ",nobh");
660 666
@@ -744,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
744static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
745static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
746static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
747 char *path, int remount); 753 char *path);
748static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
749static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
750 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -763,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
763 769
764static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
765 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
766 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
767 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
768 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
769 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
770 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
771 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
772}; 778};
773#endif 779#endif
774 780
@@ -810,8 +816,8 @@ enum {
810 Opt_data_err_abort, Opt_data_err_ignore, 816 Opt_data_err_abort, Opt_data_err_ignore,
811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 817 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 818 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, 819 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
814 Opt_usrquota, Opt_grpquota 820 Opt_resize, Opt_usrquota, Opt_grpquota
815}; 821};
816 822
817static const match_table_t tokens = { 823static const match_table_t tokens = {
@@ -865,6 +871,8 @@ static const match_table_t tokens = {
865 {Opt_quota, "quota"}, 871 {Opt_quota, "quota"},
866 {Opt_usrquota, "usrquota"}, 872 {Opt_usrquota, "usrquota"},
867 {Opt_barrier, "barrier=%u"}, 873 {Opt_barrier, "barrier=%u"},
874 {Opt_barrier, "barrier"},
875 {Opt_nobarrier, "nobarrier"},
868 {Opt_resize, "resize"}, 876 {Opt_resize, "resize"},
869 {Opt_err, NULL}, 877 {Opt_err, NULL},
870}; 878};
@@ -967,7 +975,11 @@ static int parse_options (char *options, struct super_block *sb,
967 int token; 975 int token;
968 if (!*p) 976 if (!*p)
969 continue; 977 continue;
970 978 /*
979 * Initialize args struct so we know whether arg was
980 * found; some options take optional arguments.
981 */
982 args[0].to = args[0].from = 0;
971 token = match_token(p, tokens, args); 983 token = match_token(p, tokens, args);
972 switch (token) { 984 switch (token) {
973 case Opt_bsd_df: 985 case Opt_bsd_df:
@@ -1215,9 +1227,15 @@ set_qf_format:
1215 case Opt_abort: 1227 case Opt_abort:
1216 set_opt(sbi->s_mount_opt, ABORT); 1228 set_opt(sbi->s_mount_opt, ABORT);
1217 break; 1229 break;
1230 case Opt_nobarrier:
1231 clear_opt(sbi->s_mount_opt, BARRIER);
1232 break;
1218 case Opt_barrier: 1233 case Opt_barrier:
1219 if (match_int(&args[0], &option)) 1234 if (args[0].from) {
1220 return 0; 1235 if (match_int(&args[0], &option))
1236 return 0;
1237 } else
1238 option = 1; /* No argument, default to 1 */
1221 if (option) 1239 if (option)
1222 set_opt(sbi->s_mount_opt, BARRIER); 1240 set_opt(sbi->s_mount_opt, BARRIER);
1223 else 1241 else
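Supporting a bare "barrier" token alongside "barrier=%u" requires knowing whether match_token() actually captured an argument, which is why args[0] is zeroed before each parse above. The same optional-argument idea in a tiny standalone parser (string handling deliberately simplified; this is not the kernel match_table API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct arg_span { const char *from, *to; };

/* Returns 1 if opt is a barrier option; fills *arg only when an
 * "=<value>" argument was present. */
static int parse_barrier(const char *opt, struct arg_span *arg)
{
	arg->from = arg->to = NULL;	/* like args[0].to = args[0].from = 0 */
	if (!strcmp(opt, "barrier"))
		return 1;		/* bare token, no argument */
	if (!strncmp(opt, "barrier=", 8)) {
		arg->from = opt + 8;
		arg->to = opt + strlen(opt);
		return 1;
	}
	return 0;
}

int main(void)
{
	const char *opts[] = { "barrier", "barrier=0", "barrier=1" };

	for (int i = 0; i < 3; i++) {
		struct arg_span arg;

		if (!parse_barrier(opts[i], &arg))
			continue;
		int val = arg.from ? atoi(arg.from) : 1; /* no arg: default 1 */
		printf("%-10s -> barrier=%d\n", opts[i], val);
	}
	return 0;
}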
@@ -1511,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1511 /* Turn quotas off */ 1529 /* Turn quotas off */
1512 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1513 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1514 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1515 } 1533 }
1516#endif 1534#endif
1517 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -1890,21 +1908,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1908 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1891 spin_lock_init(&sbi->s_next_gen_lock); 1909 spin_lock_init(&sbi->s_next_gen_lock);
1892 1910
1893 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1894 ext3_count_free_blocks(sb));
1895 if (!err) {
1896 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1897 ext3_count_free_inodes(sb));
1898 }
1899 if (!err) {
1900 err = percpu_counter_init(&sbi->s_dirs_counter,
1901 ext3_count_dirs(sb));
1902 }
1903 if (err) {
1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1905 goto failed_mount3;
1906 }
1907
 1908	/* per filesystem reservation list head & lock */ 1911	/* per filesystem reservation list head & lock */
1909 spin_lock_init(&sbi->s_rsv_window_lock); 1912 spin_lock_init(&sbi->s_rsv_window_lock);
1910 sbi->s_rsv_window_root = RB_ROOT; 1913 sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1948,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1945 if (!test_opt(sb, NOLOAD) && 1948 if (!test_opt(sb, NOLOAD) &&
1946 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1949 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1947 if (ext3_load_journal(sb, es, journal_devnum)) 1950 if (ext3_load_journal(sb, es, journal_devnum))
1948 goto failed_mount3; 1951 goto failed_mount2;
1949 } else if (journal_inum) { 1952 } else if (journal_inum) {
1950 if (ext3_create_journal(sb, es, journal_inum)) 1953 if (ext3_create_journal(sb, es, journal_inum))
1951 goto failed_mount3; 1954 goto failed_mount2;
1952 } else { 1955 } else {
1953 if (!silent) 1956 if (!silent)
1954 ext3_msg(sb, KERN_ERR, 1957 ext3_msg(sb, KERN_ERR,
1955 "error: no journal found. " 1958 "error: no journal found. "
1956 "mounting ext3 over ext2?"); 1959 "mounting ext3 over ext2?");
1960 goto failed_mount2;
1961 }
1962 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1963 ext3_count_free_blocks(sb));
1964 if (!err) {
1965 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1966 ext3_count_free_inodes(sb));
1967 }
1968 if (!err) {
1969 err = percpu_counter_init(&sbi->s_dirs_counter,
1970 ext3_count_dirs(sb));
1971 }
1972 if (err) {
1973 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1957 goto failed_mount3; 1974 goto failed_mount3;
1958 } 1975 }
1959 1976
@@ -1978,7 +1995,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1978 ext3_msg(sb, KERN_ERR, 1995 ext3_msg(sb, KERN_ERR,
1979 "error: journal does not support " 1996 "error: journal does not support "
1980 "requested data journaling mode"); 1997 "requested data journaling mode");
1981 goto failed_mount4; 1998 goto failed_mount3;
1982 } 1999 }
1983 default: 2000 default:
1984 break; 2001 break;
@@ -2001,19 +2018,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 if (IS_ERR(root)) { 2018 if (IS_ERR(root)) {
2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed"); 2019 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2003 ret = PTR_ERR(root); 2020 ret = PTR_ERR(root);
2004 goto failed_mount4; 2021 goto failed_mount3;
2005 } 2022 }
2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2023 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2007 iput(root); 2024 iput(root);
2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); 2025 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2009 goto failed_mount4; 2026 goto failed_mount3;
2010 } 2027 }
2011 sb->s_root = d_alloc_root(root); 2028 sb->s_root = d_alloc_root(root);
2012 if (!sb->s_root) { 2029 if (!sb->s_root) {
2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); 2030 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2014 iput(root); 2031 iput(root);
2015 ret = -ENOMEM; 2032 ret = -ENOMEM;
2016 goto failed_mount4; 2033 goto failed_mount3;
2017 } 2034 }
2018 2035
2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2036 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2056,11 @@ cantfind_ext3:
2039 sb->s_id); 2056 sb->s_id);
2040 goto failed_mount; 2057 goto failed_mount;
2041 2058
2042failed_mount4:
2043 journal_destroy(sbi->s_journal);
2044failed_mount3: 2059failed_mount3:
2045 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2060 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2046 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2061 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2047 percpu_counter_destroy(&sbi->s_dirs_counter); 2062 percpu_counter_destroy(&sbi->s_dirs_counter);
2063 journal_destroy(sbi->s_journal);
2048failed_mount2: 2064failed_mount2:
2049 for (i = 0; i < db_count; i++) 2065 for (i = 0; i < db_count; i++)
2050 brelse(sbi->s_group_desc[i]); 2066 brelse(sbi->s_group_desc[i]);
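The relabeling here goes hand in hand with moving the percpu counter setup after journal load: each failed_mountN label must unwind exactly the resources acquired before the jump, in reverse order of acquisition. A generic sketch of the convention, using malloc/free as stand-in resources:

#include <stdlib.h>

static int toy_mount(void)
{
	char *groups, *journal, *counters;

	groups = malloc(16);
	if (!groups)
		goto fail;
	journal = malloc(16);
	if (!journal)
		goto fail_groups;
	counters = malloc(16);	/* init moved after the journal, as above */
	if (!counters)
		goto fail_journal;

	/* "mounted"; toy teardown so the example is leak-free */
	free(counters);
	free(journal);
	free(groups);
	return 0;

fail_journal:
	free(journal);
fail_groups:
	free(groups);
fail:
	return -1;
}

int main(void)
{
	return toy_mount() ? 1 : 0;
}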
@@ -2278,6 +2294,9 @@ static int ext3_load_journal(struct super_block *sb,
2278 return -EINVAL; 2294 return -EINVAL;
2279 } 2295 }
2280 2296
2297 if (!(journal->j_flags & JFS_BARRIER))
2298 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2299
2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2300 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2282 err = journal_update_format(journal); 2301 err = journal_update_format(journal);
2283 if (err) { 2302 if (err) {
@@ -2534,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2534 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2535 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2536 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2537 int err; 2557 int err;
2538#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2539 int i; 2559 int i;
@@ -2580,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2580 } 2600 }
2581 2601
2582 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2583 /* 2607 /*
2584 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2585 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2634,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2634 goto restore_opts; 2658 goto restore_opts;
2635 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2636 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2637 } 2662 }
2638 } 2663 }
2639#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2645,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645#endif 2670#endif
2646 unlock_super(sb); 2671 unlock_super(sb);
2647 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2648 return 0; 2676 return 0;
2649restore_opts: 2677restore_opts:
2650 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2834,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2834 */ 2862 */
2835static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2836{ 2864{
2837 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2838 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2839} 2867}
2840 2868
2841/* 2869/*
2842 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2843 */ 2871 */
2844static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2845 char *name, int remount) 2873 char *name)
2846{ 2874{
2847 int err; 2875 int err;
2848 struct path path; 2876 struct path path;
2849 2877
2850 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2851 return -EINVAL; 2879 return -EINVAL;
2852 /* When remounting, no checks are needed and in fact, name is NULL */
2853 if (remount)
2854 return vfs_quota_on(sb, type, format_id, name, remount);
2855 2880
2856 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2857 if (err) 2882 if (err)
@@ -2889,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2889 } 2914 }
2890 } 2915 }
2891 2916
2892 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2893 path_put(&path); 2918 path_put(&path);
2894 return err; 2919 return err;
2895} 2920}
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext3_setattr,
37#ifdef CONFIG_EXT3_FS_XATTR 38#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
45const struct inode_operations ext3_fast_symlink_inode_operations = { 46const struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link, 48 .follow_link = ext3_follow_link,
49 .setattr = ext3_setattr,
48#ifdef CONFIG_EXT3_FS_XATTR 50#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
@@ -68,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
68 return err; 69 return err;
69} 70}
70 71
71struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
72 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
73 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
74 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
454 return error; 454 return error;
455} 455}
456 456
457struct xattr_handler ext4_xattr_acl_access_handler = { 457const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 458 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 459 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 460 .list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 462 .set = ext4_xattr_set_acl,
463}; 463};
464 464
465struct xattr_handler ext4_xattr_acl_default_handler = { 465const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 466 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 467 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 468 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
 596	 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
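The new dquot_alloc_block_nofail() call charges metadata blocks to the owner's quota without letting that charge fail the allocation — the blocks are already committed by this point. Modeled in plain C (struct quota and the helper are invented for illustration):

#include <stdio.h>

struct quota { long used, limit; };

/* Charge unconditionally: may exceed the limit, never returns EDQUOT.
 * The point is that the books stay accurate even for metadata. */
static void alloc_block_nofail(struct quota *q, long nblocks)
{
	q->used += nblocks;
	if (q->used > q->limit)
		printf("over quota by %ld blocks (allowed for metadata)\n",
		       q->used - q->limit);
}

int main(void)
{
	struct quota q = { .used = 98, .limit = 100 };

	alloc_block_nofail(&q, 4);
	printf("used=%ld limit=%ld\n", q.used, q.limit);
	return 0;
}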
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/slab.h>
21#include "ext4.h" 22#include "ext4.h"
22 23
23struct ext4_system_zone { 24struct ext4_system_zone {
@@ -71,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
71 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
72 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
73 else { 74 else {
74 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
75 entry->count)) 76 entry->count))
76 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
77 entry->start_blk); 78 entry->start_blk);
78 new_node = *n; 79 new_node = *n;
79 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
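ext4_readdir is one of the first callers converted to the new struct ext4_map_blocks, which carries only the logical range in and the physical block plus flags out — far less stack than the buffer_head it replaces. A sketch of that calling convention (toy_map_blocks pretends the file is laid out linearly):

#include <stdio.h>

typedef unsigned long long fsblk_t;

/* Request/response struct in the spirit of ext4_map_blocks */
struct map_blocks {
	fsblk_t      m_pblk;	/* out: first physical block */
	unsigned int m_lblk;	/* in:  first logical block */
	unsigned int m_len;	/* in/out: number of blocks */
	unsigned int m_flags;	/* out: MAPPED/NEW/... */
};

#define MAP_MAPPED 0x1

static int toy_map_blocks(struct map_blocks *map)
{
	map->m_pblk = 8192 + map->m_lblk;	/* pretend linear layout */
	map->m_flags = MAP_MAPPED;
	return (int)map->m_len;		/* number of blocks mapped */
}

int main(void)
{
	struct map_blocks map = { .m_lblk = 42, .m_len = 1 };

	if (toy_map_blocks(&map) > 0 && (map.m_flags & MAP_MAPPED))
		printf("lblk %u -> pblk %llu\n", map.m_lblk, map.m_pblk);
	return 0;
}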
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 81 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
398 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
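The CHECK_FLAG_VALUE machinery verifies at boot that each EXT4_*_FL mask equals 1 << EXT4_INODE_*, relying on the optimizer to drop the checks when they hold. With C11 the same invariant could instead be a compile-time assertion; a minimal sketch using only the SYNC pair (values taken from the flag definitions above, the remaining pairs omitted):

#include <assert.h>

#define SYNC_FL    0x00000008	/* on-disk mask */
#define INODE_SYNC 3		/* bit number from the enum */

static_assert(SYNC_FL == (1 << INODE_SYNC), "SYNC flag mismatch");

int main(void)
{
	return 0;
}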
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
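EXT4_INODE_BIT_FNS stamps out matching test/set/clear accessors for both i_flags and i_state_flags, which is what lets the rest of this series replace raw "& EXT4_*_FL" tests with ext4_test_inode_flag() calls. A userspace rendering of the macro — plain bit operations here, where the kernel versions use the atomic test_bit/set_bit/clear_bit:

#include <stdio.h>

struct toy_inode {
	unsigned long flags;
	unsigned long state_flags;
};

/* One macro generates the whole accessor triplet per flags word */
#define INODE_BIT_FNS(name, field)					\
static int test_inode_##name(struct toy_inode *i, int bit)		\
{ return !!(i->field & (1UL << bit)); }					\
static void set_inode_##name(struct toy_inode *i, int bit)		\
{ i->field |= (1UL << bit); }						\
static void clear_inode_##name(struct toy_inode *i, int bit)		\
{ i->field &= ~(1UL << bit); }

INODE_BIT_FNS(flag, flags)
INODE_BIT_FNS(state, state_flags)

enum { INODE_INDEX = 12 };	/* hash-indexed directory */

int main(void)
{
	struct toy_inode inode = { 0, 0 };

	set_inode_flag(&inode, INODE_INDEX);
	printf("index flag: %d\n", test_inode_flag(&inode, INODE_INDEX));
	clear_inode_flag(&inode, INODE_INDEX);
	printf("index flag: %d\n", test_inode_flag(&inode, INODE_INDEX));

	set_inode_state(&inode, 1);
	printf("state bit 1: %d\n", test_inode_state(&inode, 1));
	clear_inode_state(&inode, 1);
	return 0;
}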
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
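All four predicates in this file move from open-coded i_flags mask tests to ext4_test_inode_flag(). The helper family is essentially atomic bitops on EXT4_I(inode)->i_flags; a sketch, assuming each EXT4_INODE_* enum value equals the bit position of the matching EXT4_*_FL mask and that i_flags is an unsigned long:

        static inline int ext4_test_inode_flag(struct inode *inode, int bit)
        {
                return test_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_set_inode_flag(struct inode *inode, int bit)
        {
                set_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_clear_inode_flag(struct inode *inode, int bit)
        {
                clear_bit(bit, &EXT4_I(inode)->i_flags);
        }

The atomic bitops close the read-modify-write races that concurrent i_flags |= / i_flags &= ~ updates were exposed to.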
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 94c8ee81f5e1..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
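Together with the ext4_ext_truncate_extend_restart() hunk above, which now reports -EAGAIN after a successful journal restart instead of invalidating the extent cache in place, the remove-space path becomes a restartable loop: whenever a restart dropped i_data_sem, the whole tree walk is redone from a fresh path. The resulting shape, condensed into a sketch (walk_and_free_extents() is a hypothetical stand-in for the `while (i >= 0 && err == 0)` walk):

again:
        ext4_ext_invalidate_cache(inode);

        depth = ext_depth(inode);       /* re-read: may have changed */
        path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
        if (path == NULL) {
                ext4_journal_stop(handle);
                return -ENOMEM;
        }
        path[0].p_depth = depth;
        path[0].p_hdr = ext_inode_hdr(inode);

        /* returns -EAGAIN if a journal restart made the cached walk stale */
        err = walk_and_free_extents(inode, path, start);

        ext4_ext_drop_refs(path);
        kfree(path);
        if (err == -EAGAIN)
                goto again;
        ext4_journal_stop(handle);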
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
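The reworked ext4_ext_zeroout() now fails fast: -ENOMEM when bio_alloc() returns NULL, -EIO as soon as one bio completes without BIO_UPTODATE, and 0 only after the whole extent has been written. The synchronous submit-and-wait core that each loop iteration runs looks roughly like this (a sketch; adding the ZERO_PAGE-backed segments is elided into a comment):

        struct completion event;
        struct bio *bio;

        bio = bio_alloc(GFP_NOIO, len);         /* len: pages this pass can hold */
        if (!bio)
                return -ENOMEM;
        bio->bi_sector = ee_pblock;             /* in 512-byte sectors */
        bio->bi_bdev = inode->i_sb->s_bdev;

        /* bio_add_page() zero pages here, counting the blocks covered */

        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;           /* completes bio->bi_private */
        submit_bio(WRITE, bio);
        wait_for_completion(&event);

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                bio_put(bio);
                return -EIO;
        }
        bio_put(bio);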
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (up to three - one initialized and two 2616 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2621 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
2663 * It is safe to convert extent to initialized via explicit
2664 * zeroout only if extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
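Zeroing out converts the extent to initialized, so it is only legal when no part of the extent lies beyond EOF; otherwise blocks preallocated past i_size (fallocate with KEEP_SIZE) would silently become initialized. A worked example of the gate just computed, assuming 4096-byte blocks:

        /* i_size = 10000 bytes, write of m_len = 2 blocks at m_lblk = 1:
         *   eof_block = (10000 + 4095) >> 12 = 3
         *   m_lblk + m_len = 3, so eof_block stays 3
         *
         * extent A: ee_block = 0, ee_len = 3  ->  0 + 3 <= 3, may_zeroout = 1
         * extent B: ee_block = 0, ee_len = 8  ->  0 + 8 >  3, may_zeroout = 0
         *           (B was preallocated past EOF and must stay uninitialized)
         */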
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialised */ 2699 /* ex3: to ee_block + ee_len : uninitialised */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */ 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
2688 * iblock == ee_block is handled by the zeroout 2705 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2807 * to insert an extent in the middle, zero out directly 2826 * to insert an extent in the middle, zero out directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
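Every insert in the split paths now shares the same fallback, newly gated on may_zeroout: when ext4_ext_insert_extent() cannot make room for the split (-ENOSPC), the original extent is zeroed and restored as one initialized extent instead. Assembled from the hunks above, the recurring pattern is:

        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err == -ENOSPC && may_zeroout) {
                /* no room to grow the tree: zero the whole original
                 * extent and put it back as one initialized extent */
                err = ext4_ext_zeroout(inode, &orig_ex);
                if (err)
                        goto fix_extent_len;
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len = orig_ex.ee_len;
                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                return allocated;       /* the full range is now written */
        } else if (err)
                goto fix_extent_len;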
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when DIO to write 2927 * ext4_get_blocks_dio_write() when DIO to write
2909 * to an uninitialized extent. 2928 * to an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert extent to initialized via explicit
2988 * zeroout only if extent is fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode (non-aio case) or end_io struct (aio case) 3208 * Flag the inode (non-aio case) or end_io struct (aio case)
3177 * that this IO needs conversion to written when IO is 3209 * that this IO needs conversion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
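With the conversion a lookup caller no longer prepares a buffer_head at all; it fills m_lblk/m_len and reads the answer back from the same struct. A minimal, hypothetical lookup (flags = 0 and a NULL handle, so nothing is allocated):

        struct ext4_map_blocks map;
        int ret;

        map.m_lblk = lblk;              /* logical block to resolve */
        map.m_len = len;                /* maximum blocks wanted */

        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
                printk(KERN_DEBUG "lblk %u -> pblk %llu, %u block(s)\n",
                       map.m_lblk, (unsigned long long) map.m_pblk,
                       map.m_len);
        else if (ret == 0)
                printk(KERN_DEBUG "hole at lblk %u\n", map.m_lblk);

A positive return value is the number of blocks mapped; the EXT4_MAP_* bits in m_flags replace the buffer_head state bits the old interface mutated.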
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
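One detail behind the clamp above: an initialized extent can encode 2^15 blocks, but an uninitialized one gives up a block because the most significant bit of ee_len is the uninitialized marker. Sketching the values as defined in ext4_extents.h:

        /* EXT_INIT_MAX_LEN   = 1 << 15 = 32768 blocks
         * EXT_UNINIT_MAX_LEN = EXT_INIT_MAX_LEN - 1 = 32767 blocks,
         * since ee_len's MSB flags the extent as uninitialized */
        if (map->m_len > EXT_INIT_MAX_LEN &&
            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                map->m_len = EXT_INIT_MAX_LEN;
        else if (map->m_len > EXT_UNINIT_MAX_LEN &&
                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                map->m_len = EXT_UNINIT_MAX_LEN;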
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because of alignment: 3704 * We can't just convert len to max_blocks because of alignment:
3666 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks 3705 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because of alignment: 3786 * We can't just convert len to max_blocks because of alignment:
3746 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks 3787 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3879,6 +3918,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3879 physical += offset; 3918 physical += offset;
3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3919 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3881 flags |= FIEMAP_EXTENT_DATA_INLINE; 3920 flags |= FIEMAP_EXTENT_DATA_INLINE;
3921 brelse(iloc.bh);
3882 } else { /* external block */ 3922 } else { /* external block */
3883 physical = EXT4_I(inode)->i_file_acl << blockbits; 3923 physical = EXT4_I(inode)->i_file_acl << blockbits;
3884 length = inode->i_sb->s_blocksize; 3924 length = inode->i_sb->s_blocksize;
@@ -3897,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3897 int error = 0; 3937 int error = 0;
3898 3938
3899 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3900 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3901 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3902 ext4_get_block); 3942 ext4_get_block);
3903 3943
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
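For the walk above to make progress, something must set EXT4_STATE_NEWENTRY in the first place. The namei.c side of this series is not visible in this hunk, so the producer below is an assumption made for illustration, consistent with how the flag is consumed here: the inode is tagged when its directory entry is created, and the tag is cleared as fsync works its way up the d_parent chain.

	/* assumed sketch: in ext4_add_entry(), once the entry is written */
	if (!retval)
		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);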
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 127 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 jbd2_log_wait_commit(journal, commit_tid); 131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
107 return ret; 136 return ret;
108} 137}
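Only the tail of ext4_sync_file() is visible in these hunks, so the barrier handling is easier to follow as a condensed sketch. The enclosing branch structure below is reconstructed from the visible context (an assumption, not a quotation from the patch); the blkdev_issue_flush() signature is the new four-argument form adopted above.

	if (jbd2_log_start_commit(journal, commit_tid)) {
		/*
		 * A commit was kicked off.  With data=writeback on an
		 * external journal device, flush the data device before
		 * waiting, so file data is durable no later than the
		 * commit record.
		 */
		if (ext4_should_writeback_data(inode) &&
		    (journal->j_fs_dev != journal->j_dev) &&
		    (journal->j_flags & JBD2_BARRIER))
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
					   NULL, BLKDEV_IFL_WAIT);
		ret = jbd2_log_wait_commit(journal, commit_tid);
	} else if (journal->j_flags & JBD2_BARRIER) {
		/* nothing to commit: a plain cache flush suffices */
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);
	}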
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
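The rearranged hunks make the new ext4_free_inode() ordering hard to read in diff form. Condensed, with the counter updates elided, the post-patch flow is:

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp)
		fatal = ext4_journal_get_write_access(handle, bh2);

	ext4_lock_group(sb, block_group);
	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}
	/* ... free_inodes/used_dirs counts and group checksum ... */
	ext4_unlock_group(sb, block_group);
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared)
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
	else
		ext4_error(sb, "bit already cleared for inode %lu", ino);

The point of the change: the bit is now cleared with plain ext4_clear_bit() under ext4_lock_group() rather than with the lock-free ext4_clear_bit_atomic(), journal write access is obtained before the group lock is taken, and the bitmap buffer is dirtied only when the bit was actually cleared.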
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -773,7 +766,7 @@ static int ext4_claim_inode(struct super_block *sb,
773 if (sbi->s_log_groups_per_flex) { 766 if (sbi->s_log_groups_per_flex) {
774 ext4_group_t f = ext4_flex_group(sbi, group); 767 ext4_group_t f = ext4_flex_group(sbi, group);
775 768
776 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 769 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
777 } 770 }
778 } 771 }
779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -979,16 +972,12 @@ got:
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 972 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 973 }
981 974
982 inode->i_uid = current_fsuid(); 975 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 976 inode->i_mode = mode;
984 inode->i_gid = dir->i_gid; 977 inode->i_uid = current_fsuid();
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid; 978 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 979 } else
990 inode->i_gid = current_fsgid(); 980 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 981
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 982 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 983 /* This is the optimal IO size (for stat), not the fs block size */
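The open-coded uid/gid/setgid logic above is replaced by the new VFS helper. For reference, inode_init_owner() in this kernel generation behaves roughly as follows (quoted from memory, so treat it as a sketch rather than the authoritative body):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}

That is why ext4 keeps only its GRPID special case (gid always inherited from the parent directory) open-coded.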
@@ -1045,7 +1034,7 @@ got:
1045 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1046 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1047 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1048 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1049 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1050 } 1039 }
1051 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h>
42 43
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "xattr.h" 45#include "xattr.h"
@@ -148,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
148 int ret; 149 int ret;
149 150
150 /* 151 /*
151 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
152 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
153 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
154 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -347,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
347 if (blk && 348 if (blk &&
348 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
349 blk, 1))) { 350 blk, 1))) {
350 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
351 "invalid block reference %u " 352 "invalid block reference %u", blk);
352 "in inode #%lu", blk, inode->i_ino);
353 return -EIO; 353 return -EIO;
354 } 354 }
355 } 355 }
@@ -784,7 +784,7 @@ failed:
784 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
786 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
787 /* 787 /*
788 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
789 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
790 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -874,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
874 874
875err_out: 875err_out:
876 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
877 /* 877 /*
878 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
879 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
880 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -889,9 +889,9 @@ err_out:
889} 889}
890 890
891/* 891/*
892 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
893 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
894 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
895 * 895 *
896 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
897 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -916,9 +916,8 @@ err_out:
916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
917 * blocks. 917 * blocks.
918 */ 918 */
919static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
920 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
921 struct buffer_head *bh_result,
922 int flags) 921 int flags)
923{ 922{
924 int err = -EIO; 923 int err = -EIO;
@@ -932,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
932 int count = 0; 931 int count = 0;
933 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
934 933
935 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
936 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
937 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
938 &blocks_to_boundary); 937 &blocks_to_boundary);
939 938
940 if (depth == 0) 939 if (depth == 0)
@@ -945,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
945 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
946 if (!partial) { 945 if (!partial) {
947 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
948 clear_buffer_new(bh_result);
949 count++; 947 count++;
950 /*map more blocks*/ 948 /*map more blocks*/
951 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
952 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
953 951
954 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -968,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
968 /* 966 /*
969 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
970 */ 968 */
971 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
972 970
973 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
974 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -978,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
978 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
979 */ 977 */
980 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
981 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
982 /* 980 /*
983 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
984 */ 982 */
985 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
986 &count, goal, 984 &count, goal,
987 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
988 986
@@ -994,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
994 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
995 */ 993 */
996 if (!err) 994 if (!err)
997 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
998 partial, indirect_blks, count); 996 partial, indirect_blks, count);
999 if (err) 997 if (err)
1000 goto cleanup; 998 goto cleanup;
1001 999
1002 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1003 1001
1004 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1005got_it: 1003got_it:
1006 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1007 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1008 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1009 err = count; 1009 err = count;
1010 /* Clean up and exit */ 1010 /* Clean up and exit */
1011 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1015,7 +1015,6 @@ cleanup:
1015 brelse(partial->bh); 1015 brelse(partial->bh);
1016 partial--; 1016 partial--;
1017 } 1017 }
1018 BUFFER_TRACE(bh_result, "returned");
1019out: 1018out:
1020 return err; 1019 return err;
1021} 1020}
@@ -1035,7 +1034,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1035 sector_t lblock) 1034 sector_t lblock)
1036{ 1035{
1037 struct ext4_inode_info *ei = EXT4_I(inode); 1036 struct ext4_inode_info *ei = EXT4_I(inode);
1038 int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; 1037 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1039 int blk_bits; 1038 int blk_bits;
1040 1039
1041 if (lblock < EXT4_NDIR_BLOCKS) 1040 if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1049,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1050 } 1049 }
1051 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1050 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1052 ei->i_da_metadata_calc_len = 1; 1051 ei->i_da_metadata_calc_len = 1;
1053 blk_bits = roundup_pow_of_two(lblock + 1); 1052 blk_bits = order_base_2(lblock);
1054 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1053 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1055} 1054}
1056 1055
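Two bugs are fixed in ext4_indirect_calc_metadata_amount() here. First, dind_mask was an int holding EXT4_ADDR_PER_BLOCK - 1, so `lblock & dind_mask` kept the low bits instead of clearing them; the new sector_t mask rounds lblock down to an indirect-block boundary. Second, blk_bits received roundup_pow_of_two(lblock + 1), which is a value, where a bit count was needed. A worked example with 4 KiB blocks (EXT4_ADDR_PER_BLOCK_BITS == 10); the numbers follow from the formulas shown, not from a trace:

	/*
	 * lblock = 1 << 20 (around the double/triple-indirect boundary):
	 *   old: blk_bits = roundup_pow_of_two(lblock + 1) = 2097152,
	 *        giving an estimate of 2097152/10 + 1, which is nonsense.
	 *   new: blk_bits = order_base_2(lblock) = 20,
	 *        giving 20/10 + 1 = 3 metadata blocks, i.e. one block per
	 *        level of indirection a mapping this deep can touch.
	 */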
@@ -1060,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1060 */ 1059 */
1061static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1062{ 1061{
1063 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1064 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1065 1064
1066 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1075,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1075{ 1074{
1076 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1077 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1078 int mdb_free = 0, allocated_meta_blocks = 0;
1079 1077
1080 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1081 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1090,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1090 1088
1091 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1092 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1093 used += ei->i_allocated_meta_blocks;
1094 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1095 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1096 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1097 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1098 1095
1099 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1100 /* 1097 /*
@@ -1102,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1102 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1103 * allocation blocks. 1100 * allocation blocks.
1104 */ 1101 */
1105 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1106 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1107 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1108 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1109 } 1106 }
1110 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1111 1108
1112 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1113 if (quota_claim) { 1110 if (quota_claim)
1114 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1115 if (mdb_free) 1112 else {
1116 dquot_release_reservation_block(inode, mdb_free);
1117 } else {
1118 /* 1113 /*
1119 * We did fallocate with an offset that is already delayed 1114 * We did fallocate with an offset that is already delayed
1120 * allocated. So on delayed allocated writeback we should 1115 * allocated. So on delayed allocated writeback we should
1121 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1122 * converting an fallocate region to initialized region would
1123 * have caused a metadata allocation. So claim quota for
1124 * that
1125 */ 1117 */
1126 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1127 dquot_claim_block(inode, allocated_meta_blocks);
1128 dquot_release_reservation_block(inode, mdb_free + used);
1129 } 1119 }
1130 1120
1131 /* 1121 /*
@@ -1138,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1138 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1139} 1129}
1140 1130
1141static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1142 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1143{ 1133{
1144 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1145 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1146 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1147 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1148 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1149 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1150 return -EIO; 1140 return -EIO;
1151 } 1141 }
1152 return 0; 1142 return 0;
@@ -1211,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1211} 1201}
1212 1202
1213/* 1203/*
1214 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1215 * and returns immediately if the blocks are already mapped. 1205 * and returns immediately if the blocks are already mapped.
1216 * 1206 *
1217 * Otherwise it takes the write lock of the i_data_sem and allocates 1207 * Otherwise it takes the write lock of the i_data_sem and allocates
1218 * blocks, stores the allocated blocks in the result buffer head, and 1208 * blocks, stores the allocated blocks in the result buffer head, and
1219 * marks it mapped. 1209 * marks it mapped.
1220 * 1210 *
1221 * If file type is extents based, it will call ext4_ext_get_blocks(), 1211 * If file type is extents based, it will call ext4_ext_map_blocks(),
1222 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1212 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1223 * based files 1213 * based files
1224 * 1214 *
1225 * On success, it returns the number of blocks being mapped or allocated. 1215 * On success, it returns the number of blocks being mapped or allocated.
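After the conversion, a typical caller looks roughly like this (a sketch; the struct ext4_map_blocks fields and EXT4_MAP_* flags are the ones this series introduces):

	struct ext4_map_blocks map;
	int ret;

	map.m_lblk = iblock;		/* first logical block of the range */
	map.m_len = len;		/* number of blocks to map */
	ret = ext4_map_blocks(handle, inode, &map,
			      create ? EXT4_GET_BLOCKS_CREATE : 0);
	if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) {
		/* blocks map.m_pblk .. map.m_pblk + map.m_len - 1 back it */
	} else if (ret == 0) {
		/* hole: nothing mapped and (with create == 0) nothing done */
	} else {
		/* ret is a negative errno */
	}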
@@ -1232,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1232 * 1222 *
1233 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1234 */ 1224 */
1235int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1236 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1237 int flags)
1238{ 1227{
1239 int retval; 1228 int retval;
1240 1229
1241 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1242 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1243 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1244 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1245 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1246 (unsigned long)block);
1247 /* 1234 /*
1248 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1249 * file system block. 1236 * file system block.
1250 */ 1237 */
1251 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1252 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1253 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1254 bh, 0);
1255 } else { 1241 } else {
1256 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1257 bh, 0);
1258 } 1243 }
1259 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1260 1245
1261 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1262 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1263 block, bh->b_blocknr, retval);
1264 if (ret != 0) 1248 if (ret != 0)
1265 return ret; 1249 return ret;
1266 } 1250 }
@@ -1276,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1276 * ext4_ext_get_block(), called with create = 0, returns 1260 * ext4_ext_get_block(), called with create = 0, returns
1277 * with the buffer head unmapped. 1261 * with the buffer head unmapped.
1278 */ 1262 */
1279 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1280 return retval; 1264 return retval;
1281 1265
1282 /* 1266 /*
@@ -1289,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1289 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1290 * set on the buffer_head. 1274 * set on the buffer_head.
1291 */ 1275 */
1292 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1293 1277
1294 /* 1278 /*
1295 * New blocks allocate and/or writing to uninitialized extent 1279 * New blocks allocate and/or writing to uninitialized extent
@@ -1311,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1311 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1312 * could have changed the inode type in between 1296 * could have changed the inode type in between
1313 */ 1297 */
1314 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1315 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1316 bh, flags);
1317 } else { 1300 } else {
1318 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1319 max_blocks, bh, flags);
1320 1302
1321 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1322 /* 1304 /*
1323 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1324 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1341,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1341 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1342 1324
1343 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1344 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1345 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1346 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1347 block, bh->b_blocknr, retval); 1329 map);
1348 if (ret != 0) 1330 if (ret != 0)
1349 return ret; 1331 return ret;
1350 } 1332 }
@@ -1354,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1354/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1355#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1356 1338
1357int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1358 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1359{ 1341{
1360 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1361 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1362 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1363 int dio_credits; 1345 int dio_credits;
1364 1346
1365 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1366 /* Direct IO write... */ 1351 /* Direct IO write... */
1367 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1368 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1369 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1370 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1371 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1372 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1373 goto out; 1358 return ret;
1374 } 1359 }
1375 started = 1; 1360 started = 1;
1376 } 1361 }
1377 1362
1378 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380 if (ret > 0) { 1364 if (ret > 0) {
1381 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1382 ret = 0; 1368 ret = 0;
1383 } 1369 }
1384 if (started) 1370 if (started)
1385 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1386out:
1387 return ret; 1372 return ret;
1388} 1373}
1389 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
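The b_state splice above only works if the EXT4_MAP_* flags occupy the same bit positions as the corresponding buffer_head flags. The ext4.h hunk is not shown here, so take the following as an assumption consistent with that line rather than a quotation:

	#define EXT4_MAP_NEW		(1 << BH_New)
	#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
	#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
	#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
	#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED | \
					 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)

With those definitions, `(bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags` clears exactly the four mapping-related bits and installs the fresh ones without disturbing the rest of b_state.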
1390/* 1382/*
1391 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1392 */ 1384 */
1393struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1394 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1395{ 1387{
1396 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1397 int fatal = 0, err; 1390 int fatal = 0, err;
1398 int flags = 0;
1399 1391
1400 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1401 1393
1402 dummy.b_state = 0; 1394 map.m_lblk = block;
1403 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1404 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1405 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1406 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1407 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1408 /* 1400 *errp = err;
1409 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1410 * case of a HOLE. 1402 return NULL;
1411 */ 1403 *errp = 0;
1412 if (err > 0) { 1404
1413 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1414 WARN_ON(1); 1406 if (!bh) {
1415 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1416 } 1409 }
1417 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1418 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1419 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1420 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1421 if (!bh) {
1422 *errp = -EIO;
1423 goto err;
1424 }
1425 if (buffer_new(&dummy)) {
1426 J_ASSERT(create != 0);
1427 J_ASSERT(handle != NULL);
1428 1413
1429 /* 1414 /*
1430 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1431 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1432 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1433 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1434 * problem. 1419 * problem.
1435 */ 1420 */
1436 lock_buffer(bh); 1421 lock_buffer(bh);
1437 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1438 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1439 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1440 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1441 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1442 }
1443 unlock_buffer(bh);
1444 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1445 err = ext4_handle_dirty_metadata(handle, inode, bh);
1446 if (!fatal)
1447 fatal = err;
1448 } else {
1449 BUFFER_TRACE(bh, "not a new buffer");
1450 }
1451 if (fatal) {
1452 *errp = fatal;
1453 brelse(bh);
1454 bh = NULL;
1455 } 1427 }
1456 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1457 } 1435 }
1458err: 1436 if (fatal) {
1459 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1460} 1442}
1461 1443
1462struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1859,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1859 int retries = 0; 1841 int retries = 0;
1860 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1861 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1862 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1863 int ret; 1845 int ret;
1864 1846
1865 /* 1847 /*
@@ -1869,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1869 */ 1851 */
1870repeat: 1852repeat:
1871 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1872 md_reserved = ei->i_reserved_meta_blocks;
1873 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1874 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1875 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1876 1857
1877 /* 1858 /*
1878 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1879 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1880 * time. 1861 * a small amount in the end. Here we just reserve for data.
1881 */ 1862 */
1882 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1883 if (ret) 1864 if (ret)
1884 return ret; 1865 return ret;
1885 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1886 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1887 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1888 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1889 yield(); 1873 yield();
1890 goto repeat; 1874 goto repeat;
@@ -1909,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1909 1893
1910 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1911 1895
1896 trace_ext4_da_release_space(inode, to_free);
1912 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1913 /* 1898 /*
1914 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1931,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1931 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1932 * allocation blocks. 1917 * allocation blocks.
1933 */ 1918 */
1934 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1935 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1936 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1937 } 1923 }
1938 1924
1939 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1940 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1941 1927
1942 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2041,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2041/* 2027/*
2042 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2043 * 2029 *
2044 * @mpd->inode - inode to walk through
2045 * @exbh->b_blocknr - first block on a disk
2046 * @exbh->b_size - amount of space in bytes
2047 * @logical - first logical block to start assignment with
2048 *
2049 * the function goes through all passed space and put actual disk 2030 * the function goes through all passed space and put actual disk
2050 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2051 */ 2032 */
2052static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2053 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2054{ 2035{
2055 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2056 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2057 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2058 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2059 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2060 pgoff_t index, end; 2041 pgoff_t index, end;
2061 struct pagevec pvec; 2042 struct pagevec pvec;
2062 int nr_pages, i; 2043 int nr_pages, i;
2063 2044
2064 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2065 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 2048
2068 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2089,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2089 2070
2090 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2091 do { 2072 do {
2092 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2093 break; 2074 break;
2094 cur_logical++; 2075 cur_logical++;
2095 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2096 2077
2097 do { 2078 do {
2098 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2099 break; 2080 break;
2100 2081
2101 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2102 buffer_unwritten(bh)) {
2103 2083
2104 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2105 2085
@@ -2118,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2118 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2119 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2120 2100
2121 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2122 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2123 cur_logical++; 2103 cur_logical++;
2124 pblock++; 2104 pblock++;
@@ -2129,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2129} 2109}
2130 2110
2131 2111
2132/*
2133 * __unmap_underlying_blocks - just a helper function to unmap
2134 * set of blocks described by @bh
2135 */
2136static inline void __unmap_underlying_blocks(struct inode *inode,
2137 struct buffer_head *bh)
2138{
2139 struct block_device *bdev = inode->i_sb->s_bdev;
2140 int blocks, i;
2141
2142 blocks = bh->b_size >> inode->i_blkbits;
2143 for (i = 0; i < blocks; i++)
2144 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2145}
2146
2147static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2148 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2149{ 2114{
@@ -2205,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2205static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2206{ 2171{
2207 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2208 struct buffer_head new; 2173 struct ext4_map_blocks map;
2209 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2210 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2211 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2246,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2246 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2247 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2248 */ 2213 */
2249 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2250 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2251 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2252 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2253 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2254 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2255 2221
2256 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2257 &new, get_blocks_flags);
2258 if (blks < 0) { 2223 if (blks < 0) {
2259 err = blks; 2224 err = blks;
2260 /* 2225 /*
@@ -2281,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2281 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2282 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2283 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2284 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2285 (unsigned long long) next, 2250 (unsigned long long) next,
2286 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2287 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2296,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2296 } 2261 }
2297 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2298 2263
2299 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2300 2267
2301 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2302 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2303 2271
2304 /* 2272 /*
2305 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2307,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2307 */ 2275 */
2308 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2309 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2310 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2311 2279
2312 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2313 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2348,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2348 sector_t next; 2316 sector_t next;
2349 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2350 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2351 /* check if the reserved journal credits might overflow */ 2328 /* check if the reserved journal credits might overflow */
2352 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2353 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2354 /* 2331 /*
2355 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
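The stopgap cap added in the previous hunk is easy to quantify. With the common 4 KiB block size:

	8*1024*1024 / 4096 == 2048 blocks

so an in-progress delalloc extent is flushed once it would ask mballoc for more than 8 MiB in a single call; with 1 KiB blocks the same expression yields 8192 blocks, still 8 MiB.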
@@ -2422,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2422 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2423 sector_t logical; 2400 sector_t logical;
2424 2401
2425 if (mpd->io_done) {
2426 /*
2427 * Rest of the page in the page_vec
2428 * redirty them and skip them. We will
2429 * try to write them again after
2430 * starting a new transaction
2431 */
2432 redirty_page_for_writepage(wbc, page);
2433 unlock_page(page);
2434 return MPAGE_DA_EXTENT_TAIL;
2435 }
2436 /* 2402 /*
2437 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2438 */ 2404 */
@@ -2527,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2527 * initialized properly. 2493 * initialized properly.
2528 */ 2494 */
2529static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2530 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2531{ 2497{
2498 struct ext4_map_blocks map;
2532 int ret = 0; 2499 int ret = 0;
2533 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2534 2501
@@ -2536,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2536 invalid_block = ~0; 2503 invalid_block = ~0;
2537 2504
2538 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2539 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2540 2510
2541 /* 2511 /*
2542 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2543 * preallocated blocks are unmapped but should be treated 2513 * preallocated blocks are unmapped but should be treated
2544 * the same as allocated blocks. 2514 * the same as allocated blocks.
2545 */ 2515 */
2546 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2547 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2548 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2549 /* 2522 /*
2550 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2551 * is it OK? 2524 * is it OK?
@@ -2555,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2555 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2556 return ret; 2529 return ret;
2557 2530
2558 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2559 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2560 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2561 } else if (ret > 0) { 2534 return 0;
2562 bh_result->b_size = (ret << inode->i_blkbits);
2563 if (buffer_unwritten(bh_result)) {
2564 /* A delayed write to unwritten bh should
2565 * be marked new and mapped. Mapped ensures
2566 * that we don't do get_block multiple times
2567 * when we write to the same offset and new
2568 * ensures that we do proper zero out for
2569 * partial write.
2570 */
2571 set_buffer_new(bh_result);
2572 set_buffer_mapped(bh_result);
2573 }
2574 ret = 0;
2575 } 2535 }
2576 2536
2577 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2578} 2551}
2579 2552
2580/* 2553/*
@@ -2596,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2596static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2597 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2598{ 2571{
2599 int ret = 0;
2600 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2601
2602 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2603 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2604 /*
2605 * we don't want to do block allocation in writepage
2606 * so call get_block_wrap with create = 0
2607 */
2608 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2609 if (ret > 0) {
2610 bh_result->b_size = (ret << inode->i_blkbits);
2611 ret = 0;
2612 }
2613 return ret;
2614} 2574}
2615 2575
2616static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2820,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2820 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2821 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2822 */ 2782 */
2823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2824 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2825 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2826 2786
2827 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2828} 2788}
2829 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data integrity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
2907
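
write_cache_pages_da() above is essentially a batched scan: look up at most PAGEVEC_SIZE dirty pages at a time, write each one, and stop early once nr_to_write runs out (but only for non-integrity sync). The sketch below models just the batching and early-stop logic in plain C; it deliberately omits the page locking, the page->mapping recheck, and the writeback wait that the real function performs, and all names in it are illustrative.

#include <stdio.h>

#define NPAGES 32
#define BATCH  14               /* stands in for PAGEVEC_SIZE */

static int dirty[NPAGES] = { [2] = 1, [3] = 1, [9] = 1, [20] = 1 };

static long toy_writeback(unsigned long start, unsigned long end,
                          long nr_to_write)
{
    unsigned long index = start;

    while (index <= end) {
        unsigned long stop = index + BATCH - 1 < end ? index + BATCH - 1 : end;
        unsigned long i;

        for (i = index; i <= stop; i++) {
            if (!dirty[i])
                continue;       /* like !PageDirty: someone else wrote it */
            dirty[i] = 0;       /* clear_page_dirty_for_io, then "write" */
            printf("wrote page %lu\n", i);
            if (--nr_to_write == 0)
                return 0;       /* WB_SYNC_NONE-style early stop */
        }
        index = stop + 1;       /* advance past this batch */
    }
    return nr_to_write;
}

int main(void)
{
    long left = toy_writeback(0, NPAGES - 1, 3);

    printf("nr_to_write left: %ld\n", left);
    return 0;
}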
2830static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2831 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2832{ 2910{
@@ -2835,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2835 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2836 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2837 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2838 int no_nrwrite_index_update;
2839 int pages_written = 0; 2916 int pages_written = 0;
2840 long pages_skipped; 2917 long pages_skipped;
2841 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2915,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2915 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2916 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2917 2994
2918 /*
2919 * we don't want write_cache_pages to update
2920 * nr_to_write and writeback_index
2921 */
2922 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2923 wbc->no_nrwrite_index_update = 1;
2924 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2925 2996
2926retry: 2997retry:
@@ -2940,7 +3011,7 @@ retry:
2940 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2941 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2942 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2943 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2944 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2945 goto out_writepages; 3016 goto out_writepages;
2946 } 3017 }
@@ -2962,8 +3033,7 @@ retry:
2962 mpd.io_done = 0; 3033 mpd.io_done = 0;
2963 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2964 mpd.retval = 0; 3035 mpd.retval = 0;
2965 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2966 &mpd);
2967 /* 3037 /*
2968 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2969 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3015,7 +3085,7 @@ retry:
3015 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3016 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3017 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3018 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3019 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3020 3090
3021 /* Update index */ 3091 /* Update index */
@@ -3029,8 +3099,6 @@ retry:
3029 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3030 3100
3031out_writepages: 3101out_writepages:
3032 if (!no_nrwrite_index_update)
3033 wbc->no_nrwrite_index_update = 0;
3034 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3035 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3036 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3075,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3075 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3076 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3077{ 3145{
3078 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3079 struct page *page; 3147 struct page *page;
3080 pgoff_t index; 3148 pgoff_t index;
3081 unsigned from, to; 3149 unsigned from, to;
@@ -3134,22 +3202,6 @@ retry:
3134 3202
3135 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3136 goto retry; 3204 goto retry;
3137
3138 if ((ret == -EDQUOT) &&
3139 EXT4_I(inode)->i_reserved_meta_blocks &&
3140 (quota_retries++ < 3)) {
3141 /*
3142 * Since we often over-estimate the number of meta
3143 * data blocks required, we may sometimes get a
3144 * spurios out of quota error even though there would
3145 * be enough space once we write the data blocks and
3146 * find out how many meta data blocks were _really_
3147 * required. So try forcing the inode write to see if
3148 * that helps.
3149 */
3150 write_inode_now(inode, (quota_retries == 3));
3151 goto retry;
3152 }
3153out: 3205out:
3154 return ret; 3206 return ret;
3155} 3207}
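
The surviving retry logic above (ret == -ENOSPC with ext4_should_retry_alloc()) is a bounded retry: a committing transaction may be about to free blocks, so the allocation is retried a capped number of times. A minimal model of that shape, assuming a helper that simply counts attempts (the real ext4_should_retry_alloc() also waits on a journal commit); all names here are illustrative:

#include <errno.h>
#include <stdio.h>

static int attempts_left = 2;

static int toy_alloc(void)
{
    if (attempts_left-- > 0)
        return -ENOSPC;         /* freed space not visible yet */
    return 0;                   /* commit released space, allocation works */
}

static int toy_should_retry(int *retries)
{
    return (*retries)++ < 3;    /* like ext4_should_retry_alloc()'s cap */
}

int main(void)
{
    int ret, retries = 0;
retry:
    ret = toy_alloc();
    if (ret == -ENOSPC && toy_should_retry(&retries))
        goto retry;
    printf("ret=%d after %d retries\n", ret, retries);
    return 0;
}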
@@ -3545,46 +3597,18 @@ out:
3545 return ret; 3597 return ret;
3546} 3598}
3547 3599
3600/*
3601 * ext4_get_block variant used when preparing for a DIO write or buffered write.
3602 * We allocate an uninitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3548static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3549 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3550{ 3607{
3551 handle_t *handle = ext4_journal_current_handle();
3552 int ret = 0;
3553 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3554 int dio_credits;
3555 int started = 0;
3556
3557 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3558 inode->i_ino, create); 3609 inode->i_ino, create);
3559 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3560 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3561 * We allocate an uinitialized extent if blocks haven't been allocated.
3562 * The extent will be converted to initialized after IO complete.
3563 */
3564 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3565
3566 if (!handle) {
3567 if (max_blocks > DIO_MAX_BLOCKS)
3568 max_blocks = DIO_MAX_BLOCKS;
3569 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3570 handle = ext4_journal_start(inode, dio_credits);
3571 if (IS_ERR(handle)) {
3572 ret = PTR_ERR(handle);
3573 goto out;
3574 }
3575 started = 1;
3576 }
3577
3578 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3579 create);
3580 if (ret > 0) {
3581 bh_result->b_size = (ret << inode->i_blkbits);
3582 ret = 0;
3583 }
3584 if (started)
3585 ext4_journal_stop(handle);
3586out:
3587 return ret;
3588} 3612}
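
After this hunk, both get_block variants in this file are thin wrappers: noalloc_get_block_write() passes flags 0 and ext4_get_block_write() passes EXT4_GET_BLOCKS_IO_CREATE_EXT, with all the journal-handle boilerplate moved into the shared _ext4_get_block(). The refactor pattern, sketched with hypothetical names:

#include <stdio.h>

#define GB_CREATE_UNINIT 0x1    /* stands in for EXT4_GET_BLOCKS_IO_CREATE_EXT */

static int get_block_common(unsigned iblock, int flags)
{
    printf("block %u, flags %#x\n", iblock, (unsigned)flags);
    return 0;                   /* map (and maybe allocate) here */
}

static int get_block_noalloc(unsigned iblock)
{
    return get_block_common(iblock, 0);                 /* never allocate */
}

static int get_block_dio_write(unsigned iblock)
{
    return get_block_common(iblock, GB_CREATE_UNINIT);  /* uninit extent */
}

int main(void)
{
    get_block_noalloc(7);
    return get_block_dio_write(7);
}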
3589 3613
3590static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3972,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3972 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3973 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3974 3998
3975 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3976 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3977 4001
3978 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
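
This is one instance of a conversion repeated throughout the series: open-coded tests of EXT4_I(inode)->i_flags become ext4_test_inode_flag()/ext4_set_inode_flag()/ext4_clear_inode_flag() calls, so the flags word can later be manipulated with atomic bitops. A plausible userspace shape for such helpers, with bit positions chosen to match the classic flag values (treat the exact numbers, and all toy_* names, as illustrative):

#include <stdio.h>

enum toy_inode_flag { TOY_INODE_EXTENTS = 19, TOY_INODE_EOFBLOCKS = 22 };

struct toy_inode { unsigned long i_flags; };

static int toy_test_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    return (i->i_flags >> f) & 1UL;
}

static void toy_set_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    i->i_flags |= 1UL << f;
}

static void toy_clear_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    i->i_flags &= ~(1UL << f);
}

int main(void)
{
    struct toy_inode ino = { 0 };

    toy_set_inode_flag(&ino, TOY_INODE_EXTENTS);
    printf("extents=%d\n", toy_test_inode_flag(&ino, TOY_INODE_EXTENTS));
    toy_clear_inode_flag(&ino, TOY_INODE_EXTENTS);
    return 0;
}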
@@ -4301,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4301 4325
4302 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4303 count)) { 4327 count)) {
4304 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4305 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4306 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4307 count);
4308 return 1; 4331 return 1;
4309 } 4332 }
4310 4333
@@ -4409,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4409 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4410 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4411 else 4434 else
4412 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4413 "circular indirect block detected, " 4436 "circular indirect block detected at "
4414 "inode=%lu, block=%llu", 4437 "block %llu",
4415 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4416 (unsigned long long) this_bh->b_blocknr);
4417 } 4439 }
4418} 4440}
4419 4441
@@ -4451,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4451 4473
4452 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4453 nr, 1)) { 4475 nr, 1)) {
4454 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4455 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4456 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4457 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4458 (unsigned long) nr);
4459 break; 4480 break;
4460 } 4481 }
4461 4482
@@ -4467,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4467 * (should be rare). 4488 * (should be rare).
4468 */ 4489 */
4469 if (!bh) { 4490 if (!bh) {
4470 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4471 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4472 inode->i_ino, nr); 4493 (unsigned long long) nr);
4473 continue; 4494 continue;
4474 } 4495 }
4475 4496
@@ -4611,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4611 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4612 return; 4633 return;
4613 4634
4614 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4615 4636
4616 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4617 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4618 4639
4619 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4620 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4621 return; 4642 return;
4622 } 4643 }
@@ -4784,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4784 4805
4785 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4786 if (!bh) { 4807 if (!bh) {
4787 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4788 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4789 return -EIO; 4810 return -EIO;
4790 } 4811 }
4791 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4883,8 +4904,8 @@ make_io:
4883 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4884 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4885 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4886 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4887 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4888 brelse(bh); 4909 brelse(bh);
4889 return -EIO; 4910 return -EIO;
4890 } 4911 }
@@ -5095,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5095 ret = 0; 5116 ret = 0;
5096 if (ei->i_file_acl && 5117 if (ei->i_file_acl &&
5097 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5118 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5098 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5119 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5099 ei->i_file_acl, inode->i_ino); 5120 ei->i_file_acl);
5100 ret = -EIO; 5121 ret = -EIO;
5101 goto bad_inode; 5122 goto bad_inode;
5102 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5123 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5141,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5141 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5162 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5142 } else { 5163 } else {
5143 ret = -EIO; 5164 ret = -EIO;
5144 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5165 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5145 inode->i_mode, inode->i_ino);
5146 goto bad_inode; 5166 goto bad_inode;
5147 } 5167 }
5148 brelse(iloc.bh); 5168 brelse(iloc.bh);
@@ -5374,17 +5394,18 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5374 } else { 5394 } else {
5375 struct ext4_iloc iloc; 5395 struct ext4_iloc iloc;
5376 5396
5377 err = ext4_get_inode_loc(inode, &iloc); 5397 err = __ext4_get_inode_loc(inode, &iloc, 0);
5378 if (err) 5398 if (err)
5379 return err; 5399 return err;
5380 if (wbc->sync_mode == WB_SYNC_ALL) 5400 if (wbc->sync_mode == WB_SYNC_ALL)
5381 sync_dirty_buffer(iloc.bh); 5401 sync_dirty_buffer(iloc.bh);
5382 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5402 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5383 ext4_error(inode->i_sb, "IO error syncing inode, " 5403 EXT4_ERROR_INODE(inode,
5384 "inode=%lu, block=%llu", inode->i_ino, 5404 "IO error syncing inode (block=%llu)",
5385 (unsigned long long)iloc.bh->b_blocknr); 5405 (unsigned long long) iloc.bh->b_blocknr);
5386 err = -EIO; 5406 err = -EIO;
5387 } 5407 }
5408 brelse(iloc.bh);
5388 } 5409 }
5389 return err; 5410 return err;
5390} 5411}
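
The added brelse(iloc.bh) fixes a reference leak: __ext4_get_inode_loc() hands back iloc.bh with an elevated refcount, and before this change the non-journal path returned without ever dropping it. The generic get/put pairing, sketched with hypothetical names:

#include <stdlib.h>

struct toy_buf { int refcount; };

static struct toy_buf *toy_get(void)
{
    struct toy_buf *b = calloc(1, sizeof(*b));

    if (b)
        b->refcount = 1;        /* caller now owns a reference */
    return b;
}

static void toy_release(struct toy_buf *b)
{
    if (b && --b->refcount == 0)
        free(b);                /* last reference: really free */
}

int main(void)
{
    struct toy_buf *b = toy_get();

    /* ... use b, possibly sync it to storage ... */
    toy_release(b);             /* the step the old code forgot */
    return 0;
}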
@@ -5423,7 +5444,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5423 if (error) 5444 if (error)
5424 return error; 5445 return error;
5425 5446
5426 if (ia_valid & ATTR_SIZE) 5447 if (is_quota_modification(inode, attr))
5427 dquot_initialize(inode); 5448 dquot_initialize(inode);
5428 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5449 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5429 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5450 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -5453,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5453 } 5474 }
5454 5475
5455 if (attr->ia_valid & ATTR_SIZE) { 5476 if (attr->ia_valid & ATTR_SIZE) {
5456 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5477 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5457 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5478 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5458 5479
5459 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5480 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5466,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5466 if (S_ISREG(inode->i_mode) && 5487 if (S_ISREG(inode->i_mode) &&
5467 attr->ia_valid & ATTR_SIZE && 5488 attr->ia_valid & ATTR_SIZE &&
5468 (attr->ia_size < inode->i_size || 5489 (attr->ia_size < inode->i_size ||
5469 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5490 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5470 handle_t *handle; 5491 handle_t *handle;
5471 5492
5472 handle = ext4_journal_start(inode, 3); 5493 handle = ext4_journal_start(inode, 3);
@@ -5498,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5498 } 5519 }
5499 } 5520 }
5500 /* ext4_truncate will clear the flag */ 5521 /* ext4_truncate will clear the flag */
5501 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5522 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5502 ext4_truncate(inode); 5523 ext4_truncate(inode);
5503 } 5524 }
5504 5525
@@ -5574,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5574 5595
5575static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5596static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5576{ 5597{
5577 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5598 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5578 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5599 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5579 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5600 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5580} 5601}
@@ -5909,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5909 */ 5930 */
5910 5931
5911 if (val) 5932 if (val)
5912 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5933 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5913 else 5934 else
5914 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5935 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 ext4_set_aops(inode); 5936 ext4_set_aops(inode);
5916 5937
5917 jbd2_journal_unlock_updates(journal); 5938 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
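
The new EXT4_IOC32_GROUP_ADD case is a classic compat thunk: read each field of the 32-bit layout with get_user(), assemble the native struct, then call the 64-bit handler under set_fs(KERNEL_DS) so the pointer to the kernel-side copy passes the user-access checks. set_fs() is kernel-only; the userspace sketch below shows just the field-by-field repacking, and the struct layouts in it are hypothetical:

#include <stdint.h>
#include <stdio.h>

struct compat_group_input {         /* what a 32-bit caller passes */
    uint32_t group;
    uint32_t block_bitmap;
    uint32_t blocks_count;
} __attribute__((packed));

struct group_input {                /* what the native handler wants */
    uint32_t group;
    uint64_t block_bitmap;
    uint64_t blocks_count;
};

static int native_group_add(const struct group_input *in)
{
    printf("group %u, bitmap at %llu\n", (unsigned)in->group,
           (unsigned long long)in->block_bitmap);
    return 0;
}

int main(void)
{
    struct compat_group_input c = { 3, 4096, 32768 };
    struct group_input in;

    in.group        = c.group;      /* widen each field explicitly */
    in.block_bitmap = c.block_bitmap;
    in.blocks_count = c.blocks_count;
    return native_group_add(&in);
}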
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 54df209d2eed..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -657,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
657 } 658 }
658} 659}
659 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
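
mb_set_largest_free_order() caches, per block group, the highest order at which a free extent exists; a later hunk in this file uses it so ext4_mb_good_group() can reject a group with a single compare against ac->ac_2order instead of scanning bb_counters[]. A minimal model of the compute-and-check pair (the names and MAX_ORDER value are illustrative):

#include <stdio.h>

#define MAX_ORDER 14            /* roughly s_blocksize_bits + 1 */

struct toy_group {
    unsigned counters[MAX_ORDER + 1];   /* free extents per order */
    int largest_free_order;
};

static void toy_set_largest_free_order(struct toy_group *g)
{
    int i;

    g->largest_free_order = -1;         /* -1 == nothing free / uninit */
    for (i = MAX_ORDER; i >= 0; i--) {
        if (g->counters[i] > 0) {
            g->largest_free_order = i;
            break;
        }
    }
}

static int toy_good_group(const struct toy_group *g, int wanted_order)
{
    return g->largest_free_order >= wanted_order;   /* O(1) check */
}

int main(void)
{
    struct toy_group g = { .counters = { [3] = 2, [7] = 1 } };

    toy_set_largest_free_order(&g);
    printf("largest=%d good(5)=%d good(8)=%d\n",
           g.largest_free_order, toy_good_group(&g, 5),
           toy_good_group(&g, 8));
    return 0;
}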
660static noinline_for_stack 682static noinline_for_stack
661void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -699,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
699 */ 721 */
700 grp->bb_free = free; 722 grp->bb_free = free;
701 } 723 }
724 mb_set_largest_free_order(sb, grp);
702 725
703 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
704 727
@@ -724,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
724 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 747 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
725 * So it can have information regarding groups_per_page which 748 * So it can have information regarding groups_per_page which
726 * is blocks_per_page/2 749 * is blocks_per_page/2
750 *
751 * Locking note: This routine takes the block group lock of all groups
752 * for this page; do not hold this lock when calling this routine!
727 */ 753 */
728 754
729static int ext4_mb_init_cache(struct page *page, char *incore) 755static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -864,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
864 BUG_ON(incore == NULL); 890 BUG_ON(incore == NULL);
865 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 891 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 892 group, page->index, i * blocksize);
893 trace_ext4_mb_buddy_bitmap_load(sb, group);
867 grinfo = ext4_get_group_info(sb, group); 894 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 895 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 896 memset(grinfo->bb_counters, 0,
@@ -881,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
881 BUG_ON(incore != NULL); 908 BUG_ON(incore != NULL);
882 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 909 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
883 group, page->index, i * blocksize); 910 group, page->index, i * blocksize);
911 trace_ext4_mb_bitmap_load(sb, group);
884 912
885 /* see comments in ext4_mb_put_pa() */ 913 /* see comments in ext4_mb_put_pa() */
886 ext4_lock_group(sb, group); 914 ext4_lock_group(sb, group);
@@ -909,6 +937,11 @@ out:
909 return err; 937 return err;
910} 938}
911 939
940/*
941 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
942 * block group lock of all groups for this page; do not hold the BG lock when
943 * calling this routine!
944 */
912static noinline_for_stack 945static noinline_for_stack
913int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 946int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
914{ 947{
@@ -1003,6 +1036,11 @@ err:
1003 return ret; 1036 return ret;
1004} 1037}
1005 1038
1039/*
1040 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1006static noinline_for_stack int 1044static noinline_for_stack int
1007ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1045ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1008 struct ext4_buddy *e4b) 1046 struct ext4_buddy *e4b)
@@ -1149,7 +1187,7 @@ err:
1149 return ret; 1187 return ret;
1150} 1188}
1151 1189
1152static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1190static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1153{ 1191{
1154 if (e4b->bd_bitmap_page) 1192 if (e4b->bd_bitmap_page)
1155 page_cache_release(e4b->bd_bitmap_page); 1193 page_cache_release(e4b->bd_bitmap_page);
@@ -1298,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1298 buddy = buddy2; 1336 buddy = buddy2;
1299 } while (1); 1337 } while (1);
1300 } 1338 }
1339 mb_set_largest_free_order(sb, e4b->bd_info);
1301 mb_check_buddy(e4b); 1340 mb_check_buddy(e4b);
1302} 1341}
1303 1342
@@ -1426,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1426 e4b->bd_info->bb_counters[ord]++; 1465 e4b->bd_info->bb_counters[ord]++;
1427 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1428 } 1467 }
1468 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1429 1469
1430 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1470 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1431 mb_check_buddy(e4b); 1471 mb_check_buddy(e4b);
@@ -1616,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1616 } 1656 }
1617 1657
1618 ext4_unlock_group(ac->ac_sb, group); 1658 ext4_unlock_group(ac->ac_sb, group);
1619 ext4_mb_release_desc(e4b); 1659 ext4_mb_unload_buddy(e4b);
1620 1660
1621 return 0; 1661 return 0;
1622} 1662}
@@ -1671,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1671 ext4_mb_use_best_found(ac, e4b); 1711 ext4_mb_use_best_found(ac, e4b);
1672 } 1712 }
1673 ext4_unlock_group(ac->ac_sb, group); 1713 ext4_unlock_group(ac->ac_sb, group);
1674 ext4_mb_release_desc(e4b); 1714 ext4_mb_unload_buddy(e4b);
1675 1715
1676 return 0; 1716 return 0;
1677} 1717}
@@ -1820,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1820 } 1860 }
1821} 1861}
1822 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1823static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1824 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1825{ 1866{
1826 unsigned free, fragments; 1867 unsigned free, fragments;
1827 unsigned i, bits;
1828 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1829 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1830 1870
1831 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1832 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1833 1879
1834 free = grp->bb_free; 1880 free = grp->bb_free;
1835 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1842,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1842 case 0: 1888 case 0:
1843 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1844 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1845 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1846 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1847 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1848 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1849 return 0; 1898 return 0;
1850 1899
1851 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1852 for (i = ac->ac_2order; i <= bits; i++)
1853 if (grp->bb_counters[i] > 0)
1854 return 1;
1855 break;
1856 case 1: 1901 case 1:
1857 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1858 return 1; 1903 return 1;
@@ -1963,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1963 sbi = EXT4_SB(sb); 2008 sbi = EXT4_SB(sb);
1964 ngroups = ext4_get_groups_count(sb); 2009 ngroups = ext4_get_groups_count(sb);
1965 /* non-extent files are limited to low blocks/groups */ 2010 /* non-extent files are limited to low blocks/groups */
1966 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2011 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1967 ngroups = sbi->s_blockfile_groups; 2012 ngroups = sbi->s_blockfile_groups;
1968 2013
1969 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2014 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2023,15 +2068,11 @@ repeat:
2023 group = ac->ac_g_ex.fe_group; 2068 group = ac->ac_g_ex.fe_group;
2024 2069
2025 for (i = 0; i < ngroups; group++, i++) { 2070 for (i = 0; i < ngroups; group++, i++) {
2026 struct ext4_group_info *grp;
2027 struct ext4_group_desc *desc;
2028
2029 if (group == ngroups) 2071 if (group == ngroups)
2030 group = 0; 2072 group = 0;
2031 2073
2032 /* quick check to skip empty groups */ 2074 /* This now checks without needing the buddy page */
2033 grp = ext4_get_group_info(sb, group); 2075 if (!ext4_mb_good_group(ac, group, cr))
2034 if (grp->bb_free == 0)
2035 continue; 2076 continue;
2036 2077
2037 err = ext4_mb_load_buddy(sb, group, &e4b); 2078 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2039,15 +2080,18 @@ repeat:
2039 goto out; 2080 goto out;
2040 2081
2041 ext4_lock_group(sb, group); 2082 ext4_lock_group(sb, group);
2083
2084 /*
2085 * We need to check again after locking the
2086 * block group
2087 */
2042 if (!ext4_mb_good_group(ac, group, cr)) { 2088 if (!ext4_mb_good_group(ac, group, cr)) {
2043 /* someone did allocation from this group */
2044 ext4_unlock_group(sb, group); 2089 ext4_unlock_group(sb, group);
2045 ext4_mb_release_desc(&e4b); 2090 ext4_mb_unload_buddy(&e4b);
2046 continue; 2091 continue;
2047 } 2092 }
2048 2093
2049 ac->ac_groups_scanned++; 2094 ac->ac_groups_scanned++;
2050 desc = ext4_get_group_desc(sb, group, NULL);
2051 if (cr == 0) 2095 if (cr == 0)
2052 ext4_mb_simple_scan_group(ac, &e4b); 2096 ext4_mb_simple_scan_group(ac, &e4b);
2053 else if (cr == 1 && 2097 else if (cr == 1 &&
@@ -2057,7 +2101,7 @@ repeat:
2057 ext4_mb_complex_scan_group(ac, &e4b); 2101 ext4_mb_complex_scan_group(ac, &e4b);
2058 2102
2059 ext4_unlock_group(sb, group); 2103 ext4_unlock_group(sb, group);
2060 ext4_mb_release_desc(&e4b); 2104 ext4_mb_unload_buddy(&e4b);
2061 2105
2062 if (ac->ac_status != AC_STATUS_CONTINUE) 2106 if (ac->ac_status != AC_STATUS_CONTINUE)
2063 break; 2107 break;
@@ -2147,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2147 ext4_lock_group(sb, group); 2191 ext4_lock_group(sb, group);
2148 memcpy(&sg, ext4_get_group_info(sb, group), i); 2192 memcpy(&sg, ext4_get_group_info(sb, group), i);
2149 ext4_unlock_group(sb, group); 2193 ext4_unlock_group(sb, group);
2150 ext4_mb_release_desc(&e4b); 2194 ext4_mb_unload_buddy(&e4b);
2151 2195
2152 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2196 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2153 sg.info.bb_fragments, sg.info.bb_first_free); 2197 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2254,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2254 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2255 init_rwsem(&meta_group_info[i]->alloc_sem); 2299 init_rwsem(&meta_group_info[i]->alloc_sem);
2256 meta_group_info[i]->bb_free_root = RB_ROOT; 2300 meta_group_info[i]->bb_free_root = RB_ROOT;
2301 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2257 2302
2258#ifdef DOUBLE_CHECK 2303#ifdef DOUBLE_CHECK
2259 { 2304 {
@@ -2534,6 +2579,23 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2534 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2579 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2535 entry->count, entry->group, entry); 2580 entry->count, entry->group, entry);
2536 2581
2582 if (test_opt(sb, DISCARD)) {
2583 int ret;
2584 ext4_fsblk_t discard_block;
2585
2586 discard_block = entry->start_blk +
2587 ext4_group_first_block_no(sb, entry->group);
2588 trace_ext4_discard_blocks(sb,
2589 (unsigned long long)discard_block,
2590 entry->count);
2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2597 }
2598
2537 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2538 /* we expect to find existing buddy because it's pinned */ 2600 /* we expect to find existing buddy because it's pinned */
2539 BUG_ON(err != 0); 2601 BUG_ON(err != 0);
@@ -2555,18 +2617,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2555 page_cache_release(e4b.bd_bitmap_page); 2617 page_cache_release(e4b.bd_bitmap_page);
2556 } 2618 }
2557 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2558 if (test_opt(sb, DISCARD)) {
2559 ext4_fsblk_t discard_block;
2560
2561 discard_block = entry->start_blk +
2562 ext4_group_first_block_no(sb, entry->group);
2563 trace_ext4_discard_blocks(sb,
2564 (unsigned long long)discard_block,
2565 entry->count);
2566 sb_issue_discard(sb, discard_block, entry->count);
2567 }
2568 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2569 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2570 } 2622 }
2571 2623
2572 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
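
Besides moving the discard earlier in the loop, the relocated code now checks sb_issue_discard()'s result: on EOPNOTSUPP it warns once and clears the DISCARD mount option, so no further TRIMs are attempted for the rest of the mount. The general feature-fallback shape, sketched with illustrative names:

#include <errno.h>
#include <stdio.h>

static unsigned long mount_opts = 0x1;  /* bit 0 ~ "discard" enabled */

static int toy_issue_discard(unsigned long long block, unsigned count)
{
    (void)block; (void)count;
    return EOPNOTSUPP;                  /* device has no TRIM support */
}

static void toy_free_blocks(unsigned long long block, unsigned count)
{
    if (mount_opts & 0x1) {
        int ret = toy_issue_discard(block, count);

        if (ret == EOPNOTSUPP) {
            fprintf(stderr, "discard not supported, disabling\n");
            mount_opts &= ~0x1;         /* never try again this mount */
        }
    }
    /* ... actually return the blocks to the allocator ... */
}

int main(void)
{
    toy_free_blocks(100, 8);
    toy_free_blocks(200, 8);            /* no second warning */
    return 0;
}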
@@ -2639,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2639 2691
2640void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2641{ 2693{
2642 /* 2694 /*
2643 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2644 * before destroying the slab cache. 2696 * before destroying the slab cache.
2645 */ 2697 */
@@ -2979,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2979 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3031 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2980 atomic_inc(&sbi->s_bal_reqs); 3032 atomic_inc(&sbi->s_bal_reqs);
2981 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3033 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2982 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3034 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2983 atomic_inc(&sbi->s_bal_success); 3035 atomic_inc(&sbi->s_bal_success);
2984 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3036 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2985 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3037 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
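
The one-line change above is a semantics fix for the allocator statistics: s_bal_success now increments when the best extent actually found (ac_b_ex) is at least as long as the original request (ac_o_ex), where it previously compared the request against the normalized goal (ac_g_ex) and so never measured the outcome at all. In assertion form, with illustrative values:

#include <assert.h>

int main(void)
{
    int requested = 8;      /* ac_o_ex.fe_len */
    int goal      = 16;     /* ac_g_ex.fe_len, after normalization */
    int best      = 8;      /* ac_b_ex.fe_len, what we actually got */

    (void)goal;             /* the old test compared requested vs goal */
    assert(best >= requested);  /* new test: caller got what it asked for */
    return 0;
}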
@@ -3121,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3121 continue; 3173 continue;
3122 3174
3123 /* non-extent files can't have physical blocks past 2^32 */ 3175 /* non-extent files can't have physical blocks past 2^32 */
3124 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3176 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3125 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3177 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3126 continue; 3178 continue;
3127 3179
@@ -3278,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3278 spin_unlock(&pa->pa_lock); 3330 spin_unlock(&pa->pa_lock);
3279 3331
3280 grp_blk = pa->pa_pstart; 3332 grp_blk = pa->pa_pstart;
3281 /* 3333 /*
3282 * If doing group-based preallocation, pa_pstart may be in the 3334 * If doing group-based preallocation, pa_pstart may be in the
3283 * next group when pa is used up 3335 * next group when pa is used up
3284 */ 3336 */
@@ -3695,7 +3747,7 @@ out:
3695 ext4_unlock_group(sb, group); 3747 ext4_unlock_group(sb, group);
3696 if (ac) 3748 if (ac)
3697 kmem_cache_free(ext4_ac_cachep, ac); 3749 kmem_cache_free(ext4_ac_cachep, ac);
3698 ext4_mb_release_desc(&e4b); 3750 ext4_mb_unload_buddy(&e4b);
3699 put_bh(bitmap_bh); 3751 put_bh(bitmap_bh);
3700 return free; 3752 return free;
3701} 3753}
@@ -3799,7 +3851,7 @@ repeat:
3799 if (bitmap_bh == NULL) { 3851 if (bitmap_bh == NULL) {
3800 ext4_error(sb, "Error reading block bitmap for %u", 3852 ext4_error(sb, "Error reading block bitmap for %u",
3801 group); 3853 group);
3802 ext4_mb_release_desc(&e4b); 3854 ext4_mb_unload_buddy(&e4b);
3803 continue; 3855 continue;
3804 } 3856 }
3805 3857
@@ -3808,7 +3860,7 @@ repeat:
3808 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3860 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3809 ext4_unlock_group(sb, group); 3861 ext4_unlock_group(sb, group);
3810 3862
3811 ext4_mb_release_desc(&e4b); 3863 ext4_mb_unload_buddy(&e4b);
3812 put_bh(bitmap_bh); 3864 put_bh(bitmap_bh);
3813 3865
3814 list_del(&pa->u.pa_tmp_list); 3866 list_del(&pa->u.pa_tmp_list);
@@ -4072,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4072 ext4_mb_release_group_pa(&e4b, pa, ac); 4124 ext4_mb_release_group_pa(&e4b, pa, ac);
4073 ext4_unlock_group(sb, group); 4125 ext4_unlock_group(sb, group);
4074 4126
4075 ext4_mb_release_desc(&e4b); 4127 ext4_mb_unload_buddy(&e4b);
4076 list_del(&pa->u.pa_tmp_list); 4128 list_del(&pa->u.pa_tmp_list);
4077 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4129 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4078 } 4130 }
@@ -4482,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4482 if (!bh) 4534 if (!bh)
4483 tbh = sb_find_get_block(inode->i_sb, 4535 tbh = sb_find_get_block(inode->i_sb,
4484 block + i); 4536 block + i);
4485 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4486 inode, tbh, block + i); 4538 inode, tbh, block + i);
4487 } 4539 }
4488 } 4540 }
4489 4541
4490 /* 4542 /*
4491 * We need to make sure we don't reuse the freed block until 4543 * We need to make sure we don't reuse the freed block until
4492 * after the transaction is committed, which we can do by 4544 * after the transaction is committed, which we can do by
4493 * treating the block as metadata, below. We make an 4545 * treating the block as metadata, below. We make an
@@ -4608,7 +4660,7 @@ do_more:
4608 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4660 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4609 } 4661 }
4610 4662
4611 ext4_mb_release_desc(&e4b); 4663 ext4_mb_unload_buddy(&e4b);
4612 4664
4613 freed += count; 4665 freed += count;
4614 4666
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
@@ -474,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
474 */ 475 */
475 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
476 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
477 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
478 return -EINVAL; 479 return -EINVAL;
479 480
480 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
@@ -481,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
481 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
482 int ret; 483 int ret;
483 484
485 start_ext.ee_block = end_ext.ee_block = 0;
484 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
485 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
486 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -528,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
528 * new_ext |-------| 530 * new_ext |-------|
529 */ 531 */
530 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
531 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
532 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
533 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
534 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -691,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
691 while (1) { 693 while (1) {
692 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
693 if (!dext) { 695 if (!dext) {
694 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
695 "The extent for donor must be found"); 697 "The extent for donor must be found");
696 *err = -EIO; 698 *err = -EIO;
697 goto out; 699 goto out;
698 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
699 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
700 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
701 "extent(%u) should be equal", 703 "extent(%u) should be equal",
702 donor_off, 704 donor_off,
@@ -975,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
975 } 977 }
976 978
977 /* Ext4 move extent supports only extent based file */ 979 /* Ext4 move extent supports only extent based file */
978 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 980 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
979 ext4_debug("ext4 move extent: orig file is not extents " 981 ext4_debug("ext4 move extent: orig file is not extents "
980 "based file [ino:orig %lu]\n", orig_inode->i_ino); 982 "based file [ino:orig %lu]\n", orig_inode->i_ino);
981 return -EOPNOTSUPP; 983 return -EOPNOTSUPP;
982 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 984 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
983 ext4_debug("ext4 move extent: donor file is not extents " 985 ext4_debug("ext4 move extent: donor file is not extents "
984 "based file [ino:donor %lu]\n", donor_inode->i_ino); 986 "based file [ino:donor %lu]\n", donor_inode->i_ino);
985 return -EOPNOTSUPP; 987 return -EOPNOTSUPP;
@@ -1353,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1353 if (ret1 < 0) 1355 if (ret1 < 0)
1354 break; 1356 break;
1355 if (*moved_len > len) { 1357 if (*moved_len > len) {
1356 ext4_error(orig_inode->i_sb, 1358 EXT4_ERROR_INODE(orig_inode,
1357 "We replaced blocks too much! " 1359 "We replaced blocks too much! "
1358 "sum of replaced: %llu requested: %llu", 1360 "sum of replaced: %llu requested: %llu",
1359 *moved_len, len); 1361 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
187 return blocksize; 187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16); 188 return (len & 65532) | ((len & 3) << 16);
189} 189}
190 190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{ 192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) 193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
197 if (len == blocksize) { 197 if (len == blocksize) {
198 if (blocksize == 65536) 198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN); 199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else 200 else
201 return cpu_to_le16(0); 201 return cpu_to_le16(0);
202 } 202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 349 brelse(bh);
350 } 350 }
351 if (bcount) 351 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 353 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 354 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 355 return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 653 int ret, err;
654 __u32 hashval; 654 __u32 hashval;
655 655
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 657 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 658 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 659 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 661 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 662 hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 801{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 803 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 804 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 805}
806 806
807/* 807/*
@@ -943,8 +943,8 @@ restart:
943 wait_on_buffer(bh); 943 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 946 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 947 (unsigned long) block);
948 brelse(bh); 948 brelse(bh);
949 goto next; 949 goto next;
950 } 950 }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1067 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1069 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1071 } 1071 }
1072 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1075 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1077 ino); 1077 ino);
1078 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
1079 } else { 1079 } else {
1080 return ERR_CAST(inode); 1080 return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1104 brelse(bh); 1104 brelse(bh);
1105 1105
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1107 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1108 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1110 } 1110 }
1111 1111
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1141 unsigned rec_len = 0;
1142 1142
1143 while (count--) { 1143 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1145 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1146 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1147 memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1407 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1408 brelse(bh);
1411 return -EIO; 1409 return -EIO;
1412 } 1410 }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1416 brelse(bh);
1419 return retval; 1417 return retval;
1420 } 1418 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1419 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1420 data1 = bh2->b_data;
1423 1421
1424 memcpy (data1, de, len); 1422 memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1489 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1490 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1491 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1492 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1493 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1494 ext4_mark_inode_dirty(handle, dir);
1497 } 1495 }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1517 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1518 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1519 brelse(bh);
1520 if (retval == 0)
1521 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1522 return retval;
1523} 1523}
1524 1524
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1917 if (err)
1918 ext4_error(inode->i_sb, 1918 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1920 else
1922 ext4_warning(inode->i_sb, 1921 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1922 "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1940 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1941 while (offset < inode->i_size) {
1943 if (!bh || 1942 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1943 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1944 unsigned int lblock;
1945 err = 0; 1945 err = 0;
1946 brelse(bh); 1946 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1947 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1949 if (!bh) {
1950 if (err) 1950 if (err)
1951 ext4_error(sb, 1951 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1952 "error %d reading directory "
1953 " #%lu offset %u", 1953 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1954 offset += sb->s_blocksize;
1956 continue; 1955 continue;
1957 } 1956 }
@@ -2297,7 +2296,7 @@ retry:
2297 } 2296 }
2298 } else { 2297 } else {
2299 /* clear the extent format for fast symlink */ 2298 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2299 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2300 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2301 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2302 inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba191dae8730..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
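
The added block defines a second file_system_type named "ext3" that reuses ext4's get_sb callback whenever the real ext3 driver is not built and CONFIG_EXT4_USE_FOR_EXT23 is set, plus an IS_EXT3_SB() test so mount-time code can tell which name the superblock came in under. A tiny userspace model of the two-names-one-constructor idea (illustrative only; registration itself happens elsewhere in the kernel):

#include <stdio.h>
#include <string.h>

struct fs_type { const char *name; int (*mount)(const char *dev); };

static int ext4_mount(const char *dev)
{
    printf("mounting %s with the ext4 code\n", dev);
    return 0;
}

static const struct fs_type fs_types[] = {
    { "ext4", ext4_mount },
    { "ext3", ext4_mount },     /* alias: same callback, second name */
};

int main(void)
{
    size_t i;

    for (i = 0; i < sizeof(fs_types) / sizeof(fs_types[0]); i++)
        if (strcmp(fs_types[i].name, "ext3") == 0)
            return fs_types[i].mount("/dev/sda1");
    return 1;
}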
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -227,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
227 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
228 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
229 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE);
230 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
231 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
232 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
@@ -631,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
631 struct ext4_super_block *es = sbi->s_es; 646 struct ext4_super_block *es = sbi->s_es;
632 int i, err; 647 int i, err;
633 648
649 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
650
634 flush_workqueue(sbi->dio_unwritten_wq); 651 flush_workqueue(sbi->dio_unwritten_wq);
635 destroy_workqueue(sbi->dio_unwritten_wq); 652 destroy_workqueue(sbi->dio_unwritten_wq);
636 653
@@ -927,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
927 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 944 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
928 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 945 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
929 seq_puts(seq, ",journal_async_commit"); 946 seq_puts(seq, ",journal_async_commit");
947 else if (test_opt(sb, JOURNAL_CHECKSUM))
948 seq_puts(seq, ",journal_checksum");
930 if (test_opt(sb, NOBH)) 949 if (test_opt(sb, NOBH))
931 seq_puts(seq, ",nobh"); 950 seq_puts(seq, ",nobh");
932 if (test_opt(sb, I_VERSION)) 951 if (test_opt(sb, I_VERSION))
@@ -1045,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1045static int ext4_mark_dquot_dirty(struct dquot *dquot); 1064static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1065static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1066static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 char *path, int remount); 1067 char *path);
1049static int ext4_quota_on_mount(struct super_block *sb, int type); 1068static int ext4_quota_on_mount(struct super_block *sb, int type);
1050static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1069static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1051 size_t len, loff_t off); 1070 size_t len, loff_t off);
@@ -1067,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
1067 1086
1068static const struct quotactl_ops ext4_qctl_operations = { 1087static const struct quotactl_ops ext4_qctl_operations = {
1069 .quota_on = ext4_quota_on, 1088 .quota_on = ext4_quota_on,
1070 .quota_off = vfs_quota_off, 1089 .quota_off = dquot_quota_off,
1071 .quota_sync = vfs_quota_sync, 1090 .quota_sync = dquot_quota_sync,
1072 .get_info = vfs_get_dqinfo, 1091 .get_info = dquot_get_dqinfo,
1073 .set_info = vfs_set_dqinfo, 1092 .set_info = dquot_set_dqinfo,
1074 .get_dqblk = vfs_get_dqblk, 1093 .get_dqblk = dquot_get_dqblk,
1075 .set_dqblk = vfs_set_dqblk 1094 .set_dqblk = dquot_set_dqblk
1076}; 1095};
1077#endif 1096#endif
1078 1097
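[Note: this series renames the generic quota entry points from vfs_* to dquot_* tree-wide; only ->quota_on keeps a filesystem wrapper, since it must validate the quota file's location before handing off. A sketch of the resulting table for a hypothetical filesystem, assuming the post-rename exports from fs/quota/dquot.c:]

static const struct quotactl_ops myfs_qctl_operations = {
	.quota_on	= myfs_quota_on,	/* fs checks, then dquot_quota_on_path() */
	.quota_off	= dquot_quota_off,	/* the rest use the dquot defaults */
	.quota_sync	= dquot_quota_sync,
	.get_info	= dquot_get_dqinfo,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
};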
@@ -2037,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2037 /* Turn quotas off */ 2056 /* Turn quotas off */
2038 for (i = 0; i < MAXQUOTAS; i++) { 2057 for (i = 0; i < MAXQUOTAS; i++) {
2039 if (sb_dqopt(sb)->files[i]) 2058 if (sb_dqopt(sb)->files[i])
2040 vfs_quota_off(sb, i, 0); 2059 dquot_quota_off(sb, i);
2041 } 2060 }
2042#endif 2061#endif
2043 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2062 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2199,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2199struct ext4_attr { 2218struct ext4_attr {
2200 struct attribute attr; 2219 struct attribute attr;
2201 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2220 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2202 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2221 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2203 const char *, size_t); 2222 const char *, size_t);
2204 int offset; 2223 int offset;
2205}; 2224};
@@ -2416,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2416 __releases(kernel_lock) 2435 __releases(kernel_lock)
2417 __acquires(kernel_lock) 2436 __acquires(kernel_lock)
2418{ 2437{
2438 char *orig_data = kstrdup(data, GFP_KERNEL);
2419 struct buffer_head *bh; 2439 struct buffer_head *bh;
2420 struct ext4_super_block *es = NULL; 2440 struct ext4_super_block *es = NULL;
2421 struct ext4_sb_info *sbi; 2441 struct ext4_sb_info *sbi;
@@ -2539,7 +2559,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2539 * enable delayed allocation by default 2559 * enable delayed allocation by default
2540 * Use -o nodelalloc to turn it off 2560 * Use -o nodelalloc to turn it off
2541 */ 2561 */
2542 set_opt(sbi->s_mount_opt, DELALLOC); 2562 if (!IS_EXT3_SB(sb))
2563 set_opt(sbi->s_mount_opt, DELALLOC);
2543 2564
2544 if (!parse_options((char *) data, sb, &journal_devnum, 2565 if (!parse_options((char *) data, sb, &journal_devnum,
2545 &journal_ioprio, NULL, 0)) 2566 &journal_ioprio, NULL, 0))
@@ -2778,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2778 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2799 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2779 spin_lock_init(&sbi->s_next_gen_lock); 2800 spin_lock_init(&sbi->s_next_gen_lock);
2780 2801
2781 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2782 ext4_count_free_blocks(sb));
2783 if (!err) {
2784 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2785 ext4_count_free_inodes(sb));
2786 }
2787 if (!err) {
2788 err = percpu_counter_init(&sbi->s_dirs_counter,
2789 ext4_count_dirs(sb));
2790 }
2791 if (!err) {
2792 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2793 }
2794 if (err) {
2795 ext4_msg(sb, KERN_ERR, "insufficient memory");
2796 goto failed_mount3;
2797 }
2798
2799 sbi->s_stripe = ext4_get_stripe_size(sbi); 2802 sbi->s_stripe = ext4_get_stripe_size(sbi);
2800 sbi->s_max_writeback_mb_bump = 128; 2803 sbi->s_max_writeback_mb_bump = 128;
2801 2804
@@ -2895,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2895 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2898 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2896 2899
2897no_journal: 2900no_journal:
2901 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2902 ext4_count_free_blocks(sb));
2903 if (!err)
2904 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2905 ext4_count_free_inodes(sb));
2906 if (!err)
2907 err = percpu_counter_init(&sbi->s_dirs_counter,
2908 ext4_count_dirs(sb));
2909 if (!err)
2910 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2911 if (err) {
2912 ext4_msg(sb, KERN_ERR, "insufficient memory");
2913 goto failed_mount_wq;
2914 }
2898 if (test_opt(sb, NOBH)) { 2915 if (test_opt(sb, NOBH)) {
2899 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2916 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2900 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2917 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
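[Note on the move above: jbd2 replay can allocate and free blocks, so counters seeded from the on-disk totals before recovery would be stale as soon as the journal is replayed; initializing below the no_journal: label makes ext4_count_free_blocks() and friends run against the recovered image. The chained form keeps a single error check; a condensed sketch with a hypothetical helper name (same 2.6.34 percpu_counter_init() signature, no GFP argument yet):]

static int init_sb_counters(struct super_block *sb, struct ext4_sb_info *sbi)
{
	int err;

	err = percpu_counter_init(&sbi->s_freeblocks_counter,
				  ext4_count_free_blocks(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_freeinodes_counter,
					  ext4_count_free_inodes(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_dirs_counter,
					  ext4_count_dirs(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
	return err;	/* caller unwinds via failed_mount_wq */
}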
@@ -2986,7 +3003,7 @@ no_journal:
2986 err = ext4_setup_system_zone(sb); 3003 err = ext4_setup_system_zone(sb);
2987 if (err) { 3004 if (err) {
2988 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3005 ext4_msg(sb, KERN_ERR, "failed to initialize system "
2989 "zone (%d)\n", err); 3006 "zone (%d)", err);
2990 goto failed_mount4; 3007 goto failed_mount4;
2991 } 3008 }
2992 3009
@@ -3025,9 +3042,11 @@ no_journal:
3025 } else 3042 } else
3026 descr = "out journal"; 3043 descr = "out journal";
3027 3044
3028 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3045 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3046 "Opts: %s", descr, orig_data);
3029 3047
3030 lock_kernel(); 3048 lock_kernel();
3049 kfree(orig_data);
3031 return 0; 3050 return 0;
3032 3051
3033cantfind_ext4: 3052cantfind_ext4:
@@ -3044,6 +3063,10 @@ failed_mount_wq:
3044 jbd2_journal_destroy(sbi->s_journal); 3063 jbd2_journal_destroy(sbi->s_journal);
3045 sbi->s_journal = NULL; 3064 sbi->s_journal = NULL;
3046 } 3065 }
3066 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3067 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3068 percpu_counter_destroy(&sbi->s_dirs_counter);
3069 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3047failed_mount3: 3070failed_mount3:
3048 if (sbi->s_flex_groups) { 3071 if (sbi->s_flex_groups) {
3049 if (is_vmalloc_addr(sbi->s_flex_groups)) 3072 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3051,10 +3074,6 @@ failed_mount3:
3051 else 3074 else
3052 kfree(sbi->s_flex_groups); 3075 kfree(sbi->s_flex_groups);
3053 } 3076 }
3054 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3055 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3056 percpu_counter_destroy(&sbi->s_dirs_counter);
3057 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3058failed_mount2: 3077failed_mount2:
3059 for (i = 0; i < db_count; i++) 3078 for (i = 0; i < db_count; i++)
3060 brelse(sbi->s_group_desc[i]); 3079 brelse(sbi->s_group_desc[i]);
@@ -3074,6 +3093,7 @@ out_fail:
3074 kfree(sbi->s_blockgroup_lock); 3093 kfree(sbi->s_blockgroup_lock);
3075 kfree(sbi); 3094 kfree(sbi);
3076 lock_kernel(); 3095 lock_kernel();
3096 kfree(orig_data);
3077 return ret; 3097 return ret;
3078} 3098}
3079 3099
@@ -3365,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3365 if (!(sb->s_flags & MS_RDONLY)) 3385 if (!(sb->s_flags & MS_RDONLY))
3366 es->s_wtime = cpu_to_le32(get_seconds()); 3386 es->s_wtime = cpu_to_le32(get_seconds());
3367 es->s_kbytes_written = 3387 es->s_kbytes_written =
3368 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3388 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3369 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3389 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3370 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3390 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3371 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3391 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3470,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
3470 return 0; 3490 return 0;
3471 3491
3472 journal = EXT4_SB(sb)->s_journal; 3492 journal = EXT4_SB(sb)->s_journal;
3473 if (journal) 3493 if (journal) {
3494 vfs_check_frozen(sb, SB_FREEZE_WRITE);
3474 ret = ext4_journal_force_commit(journal); 3495 ret = ext4_journal_force_commit(journal);
3496 }
3475 3497
3476 return ret; 3498 return ret;
3477} 3499}
@@ -3520,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
3520 * the journal. 3542 * the journal.
3521 */ 3543 */
3522 error = jbd2_journal_flush(journal); 3544 error = jbd2_journal_flush(journal);
3523 if (error < 0) { 3545 if (error < 0)
3524 out: 3546 goto out;
3525 jbd2_journal_unlock_updates(journal);
3526 return error;
3527 }
3528 3547
3529 /* Journal blocked and flushed, clear needs_recovery flag. */ 3548 /* Journal blocked and flushed, clear needs_recovery flag. */
3530 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3549 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3531 error = ext4_commit_super(sb, 1); 3550 error = ext4_commit_super(sb, 1);
3532 if (error) 3551out:
3533 goto out; 3552 /* we rely on s_frozen to stop further updates */
3534 return 0; 3553 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3554 return error;
3535} 3555}
3536 3556
3537/* 3557/*
@@ -3548,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
3548 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3568 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3549 ext4_commit_super(sb, 1); 3569 ext4_commit_super(sb, 1);
3550 unlock_super(sb); 3570 unlock_super(sb);
3551 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3552 return 0; 3571 return 0;
3553} 3572}
3554 3573
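[Note: after this change the journal stays locked for the whole freeze window, and writers are fenced purely by the superblock's frozen state. The vfs_check_frozen() call sites added above rely on the stock fs.h macro, reproduced here for context as it stood in this kernel generation:]

#define vfs_check_frozen(sb, level) \
	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

[So ext4_journal_start_sb() now sleeps at SB_FREEZE_WRITE rather than starting a transaction against a frozen filesystem, which is what lets ext4_freeze() keep jbd2_journal_unlock_updates() on the single exit path above.]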
@@ -3559,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3559 ext4_fsblk_t n_blocks_count = 0; 3578 ext4_fsblk_t n_blocks_count = 0;
3560 unsigned long old_sb_flags; 3579 unsigned long old_sb_flags;
3561 struct ext4_mount_options old_opts; 3580 struct ext4_mount_options old_opts;
3581 int enable_quota = 0;
3562 ext4_group_t g; 3582 ext4_group_t g;
3563 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3583 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3564 int err; 3584 int err;
3565#ifdef CONFIG_QUOTA 3585#ifdef CONFIG_QUOTA
3566 int i; 3586 int i;
3567#endif 3587#endif
3588 char *orig_data = kstrdup(data, GFP_KERNEL);
3568 3589
3569 lock_kernel(); 3590 lock_kernel();
3570 3591
@@ -3615,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3615 } 3636 }
3616 3637
3617 if (*flags & MS_RDONLY) { 3638 if (*flags & MS_RDONLY) {
3639 err = dquot_suspend(sb, -1);
3640 if (err < 0)
3641 goto restore_opts;
3642
3618 /* 3643 /*
3619 * First of all, the unconditional stuff we have to do 3644 * First of all, the unconditional stuff we have to do
3620 * to disable replay of the journal when we next remount 3645 * to disable replay of the journal when we next remount
@@ -3683,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3683 goto restore_opts; 3708 goto restore_opts;
3684 if (!ext4_setup_super(sb, es, 0)) 3709 if (!ext4_setup_super(sb, es, 0))
3685 sb->s_flags &= ~MS_RDONLY; 3710 sb->s_flags &= ~MS_RDONLY;
3711 enable_quota = 1;
3686 } 3712 }
3687 } 3713 }
3688 ext4_setup_system_zone(sb); 3714 ext4_setup_system_zone(sb);
@@ -3698,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698#endif 3724#endif
3699 unlock_super(sb); 3725 unlock_super(sb);
3700 unlock_kernel(); 3726 unlock_kernel();
3727 if (enable_quota)
3728 dquot_resume(sb, -1);
3729
3730 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3731 kfree(orig_data);
3701 return 0; 3732 return 0;
3702 3733
3703restore_opts: 3734restore_opts:
@@ -3719,6 +3750,7 @@ restore_opts:
3719#endif 3750#endif
3720 unlock_super(sb); 3751 unlock_super(sb);
3721 unlock_kernel(); 3752 unlock_kernel();
3753 kfree(orig_data);
3722 return err; 3754 return err;
3723} 3755}
3724 3756
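[Note: quota state across ro/rw remounts now flows through dquot_suspend()/dquot_resume(), with -1 meaning all quota types, instead of the remount flag that ->quota_on loses in the next hunk. A minimal sketch of the pattern, assuming the helper signatures introduced by this series:]

static int myfs_remount(struct super_block *sb, int *flags, char *data)
{
	int err;

	if (*flags & MS_RDONLY) {
		err = dquot_suspend(sb, -1);	/* park quotas before going ro */
		if (err < 0)
			return err;
		sb->s_flags |= MS_RDONLY;
	} else {
		sb->s_flags &= ~MS_RDONLY;
		dquot_resume(sb, -1);		/* re-enable what was suspended */
	}
	return 0;
}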
@@ -3891,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3891 */ 3923 */
3892static int ext4_quota_on_mount(struct super_block *sb, int type) 3924static int ext4_quota_on_mount(struct super_block *sb, int type)
3893{ 3925{
3894 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3926 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3895 EXT4_SB(sb)->s_jquota_fmt, type); 3927 EXT4_SB(sb)->s_jquota_fmt, type);
3896} 3928}
3897 3929
3898/* 3930/*
3899 * Standard function to be called on quota_on 3931 * Standard function to be called on quota_on
3900 */ 3932 */
3901static int ext4_quota_on(struct super_block *sb, int type, int format_id, 3933static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3902 char *name, int remount) 3934 char *name)
3903{ 3935{
3904 int err; 3936 int err;
3905 struct path path; 3937 struct path path;
3906 3938
3907 if (!test_opt(sb, QUOTA)) 3939 if (!test_opt(sb, QUOTA))
3908 return -EINVAL; 3940 return -EINVAL;
3909 /* When remounting, no checks are needed and in fact, name is NULL */
3910 if (remount)
3911 return vfs_quota_on(sb, type, format_id, name, remount);
3912 3941
3913 err = kern_path(name, LOOKUP_FOLLOW, &path); 3942 err = kern_path(name, LOOKUP_FOLLOW, &path);
3914 if (err) 3943 if (err)
@@ -3947,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3947 } 3976 }
3948 } 3977 }
3949 3978
3950 err = vfs_quota_on_path(sb, type, format_id, &path); 3979 err = dquot_quota_on_path(sb, type, format_id, &path);
3951 path_put(&path); 3980 path_put(&path);
3952 return err; 3981 return err;
3953} 3982}
@@ -4068,7 +4097,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
4068 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4097 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
4069} 4098}
4070 4099
4071#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4100#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4072static struct file_system_type ext2_fs_type = { 4101static struct file_system_type ext2_fs_type = {
4073 .owner = THIS_MODULE, 4102 .owner = THIS_MODULE,
4074 .name = "ext2", 4103 .name = "ext2",
@@ -4095,15 +4124,7 @@ static inline void register_as_ext2(void) { }
4095static inline void unregister_as_ext2(void) { } 4124static inline void unregister_as_ext2(void) { }
4096#endif 4125#endif
4097 4126
4098#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4127#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4099static struct file_system_type ext3_fs_type = {
4100 .owner = THIS_MODULE,
4101 .name = "ext3",
4102 .get_sb = ext4_get_sb,
4103 .kill_sb = kill_block_super,
4104 .fs_flags = FS_REQUIRES_DEV,
4105};
4106
4107static inline void register_as_ext3(void) 4128static inline void register_as_ext3(void)
4108{ 4129{
4109 int err = register_filesystem(&ext3_fs_type); 4130 int err = register_filesystem(&ext3_fs_type);
@@ -4134,6 +4155,7 @@ static int __init init_ext4_fs(void)
4134{ 4155{
4135 int err; 4156 int err;
4136 4157
4158 ext4_check_flag_values();
4137 err = init_ext4_system_zone(); 4159 err = init_ext4_system_zone();
4138 if (err) 4160 if (err)
4139 return err; 4161 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -332,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 331 size_t rest = buffer_size;
333 332
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 333 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 334 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 335 ext4_xattr_handler(entry->e_name_index);
337 336
338 if (handler) { 337 if (handler) {
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 664 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 665 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 666 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 667 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 668 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 669 error = -EIO;
672 goto cleanup; 670 goto cleanup;
673 } 671 }
@@ -820,7 +818,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 818 EXT4_I(inode)->i_block_group);
821 819
822 /* non-extent files can't have physical blocks past 2^32 */ 820 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 821 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 822 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 823
826 block = ext4_new_meta_blocks(handle, inode, 824 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
828 if (error) 826 if (error)
829 goto cleanup; 827 goto cleanup;
830 828
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 829 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 830 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 831
834 ea_idebug(inode, "creating block %d", block); 832 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
880 goto cleanup; 878 goto cleanup;
881 879
882bad_block: 880bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 881 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 882 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 883 goto cleanup;
886 884
887#undef header 885#undef header
@@ -1194,8 +1192,8 @@ retry:
1194 if (!bh) 1192 if (!bh)
1195 goto cleanup; 1193 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1194 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1195 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1196 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1197 error = -EIO;
1200 goto cleanup; 1198 goto cleanup;
1201 } 1199 }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1370 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1372 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1373 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1374 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1375 goto cleanup;
1378 } 1376 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1377 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1378 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1379 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1380 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1381 goto cleanup;
1384 } 1382 }
1385 ext4_xattr_release_block(handle, inode, bh); 1383 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
1504 } 1502 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1503 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1504 if (!bh) {
1507 ext4_error(inode->i_sb, 1505 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1506 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1507 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1508 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1509 ea_idebug(inode, "block %lu refcount %d>=%d",
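[Note: EXT4_ERROR_INODE() is a convenience added in this series that prepends the inode number itself, which is why every converted call site above sheds its explicit "inode %lu" / inode->i_ino pair. Roughly, as a sketch of the ext4.h definition (modulo the exact argument list):]

#define EXT4_ERROR_INODE(inode, fmt, a...) \
	ext4_error_inode(__func__, (inode), (fmt), ## a)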
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
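[Note: the const-ification of struct xattr_handler runs tree-wide in this series; both the per-prefix handlers and the NULL-terminated table that sb->s_xattr points at can then live in rodata. The resulting shape for a hypothetical filesystem:]

static const struct xattr_handler myfs_xattr_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= myfs_xattr_user_list,
	.get	= myfs_xattr_user_get,
	.set	= myfs_xattr_user_set,
};

const struct xattr_handler *myfs_xattr_handlers[] = {
	&myfs_xattr_user_handler,
	NULL,				/* table is NULL-terminated */
};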
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
@@ -68,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
68 return err; 69 return err;
69} 70}
70 71
71struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
72 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
73 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
74 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
@@ -241,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
241 while (*fclus < cluster) { 242 while (*fclus < cluster) {
242 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
243 if (*fclus > limit) { 244 if (*fclus > limit) {
244 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
245 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
246 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
247 nr = -EIO; 249 nr = -EIO;
248 goto out; 250 goto out;
249 } 251 }
@@ -252,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
252 if (nr < 0) 254 if (nr < 0)
253 goto out; 255 goto out;
254 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
255 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
256 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
257 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
258 nr = -EIO; 260 nr = -EIO;
259 goto out; 261 goto out;
260 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
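[Note: pack_hex_byte() (linux/kernel.h) stores one byte as two lowercase hex digits and returns the advanced pointer, so the old four-iteration nibble loop collapses into two calls. A standalone sketch of the ':'-escape emission for an unmappable character, with a hypothetical helper name:]

static char *emit_uni_escape(char *op, wchar_t ec)
{
	*op++ = ':';
	op = pack_hex_byte(op, ec >> 8);	/* high byte first */
	op = pack_hex_byte(op, ec);		/* then low byte */
	return op;				/* 5 bytes total, matching len -= 5 */
}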
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
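[Note: dropping ->ioctl for ->unlocked_ioctl loses both the BKL and the inode parameter; the inode is recovered from the struct file, and the compat path now forwards unknown commands (after compat_ptr() translation) instead of failing with -ENOIOCTLCMD. Skeleton of the pair, with hypothetical myfs names:]

static long myfs_dir_ioctl(struct file *filp, unsigned int cmd,
			   unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;

	/* ... per-command handling using inode ... */
	return myfs_generic_ioctl(filp, cmd, arg);
}

#ifdef CONFIG_COMPAT
static long myfs_compat_dir_ioctl(struct file *filp, unsigned int cmd,
				  unsigned long arg)
{
	/* 32-bit user pointers must be widened before reuse */
	return myfs_dir_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
}
#endif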
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern int fat_setsize(struct inode *inode, loff_t offset);
310extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 311extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 312 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 313extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 314
312/* fat/inode.c */ 315/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 316extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
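[Note: __ratelimit() returns non-zero while the current window's burst allowance lasts, so the fat_fs_error_ratelimit() macro above decides per call whether __fat_fs_error() actually prints. A sketch of the underlying mechanics with hypothetical names; in this kernel generation DEFAULT_RATELIMIT_INTERVAL is 5*HZ and DEFAULT_RATELIMIT_BURST is 10:]

#include <linux/ratelimit.h>

static struct ratelimit_state myfs_rs;	/* one per superblock in the fat code */

static void myfs_init_ratelimit(void)
{
	/* at most 10 reports per 5-second window; excess calls get 0
	 * back from __ratelimit() and are silently suppressed */
	ratelimit_state_init(&myfs_rs, DEFAULT_RATELIMIT_INTERVAL,
			     DEFAULT_RATELIMIT_BURST);
}

static void myfs_report_corruption(void)
{
	if (__ratelimit(&myfs_rs))
		printk(KERN_ERR "myfs: corruption detected\n");
}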
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
351 return 0; 364 return 0;
352} 365}
353 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
354#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
355/* valid file mode bits */ 380/* valid file mode bits */
356#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
365 /* 390 /*
366 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
369 */ 395 */
370 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
415 } 441 }
416 442
417 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
419out: 451out:
420 return error; 452 return error;
421} 453}
422EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
423 455
424const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 457 .setattr = fat_setattr,
427 .getattr = fat_getattr, 458 .getattr = fat_getattr,
428}; 459};
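[Note: with ->truncate gone from fat_file_inode_operations, size changes are sequenced explicitly inside ->setattr: shrink the VFS-visible size and page cache first, then release on-disk clusters, then copy the remaining attributes. A sketch of the ATTR_SIZE leg with hypothetical myfs names, assuming the simple_setsize()/generic_setattr() helpers of this kernel generation:]

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = inode_change_ok(inode, attr);

	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		error = simple_setsize(inode, attr->ia_size);	/* i_size + pagecache */
		if (error)
			return error;
		myfs_truncate_blocks(inode, attr->ia_size);	/* on-disk blocks */
	}

	generic_setattr(inode, attr);	/* times, mode, uid/gid */
	mark_inode_dirty(inode);
	return 0;
}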
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT need to use the DIO_LOCKING for avoiding the race 212 * FAT need to use the DIO_LOCKING for avoiding the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
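[Note: the _newtrunc variants of cont_write_begin() and blockdev_direct_IO() leave any blocks instantiated past i_size in place on failure, so the filesystem unwinds them itself; fat_write_failed() above is that unwind. The pattern distilled, with hypothetical names (truncate_pagecache() keeps its (inode, old, new) signature here):]

static void myfs_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		/* drop pages exposed beyond the still-valid i_size ... */
		truncate_pagecache(inode, to, inode->i_size);
		/* ... then free the blocks that backed them */
		myfs_truncate_blocks(inode, inode->i_size);
	}
}

[It is called with pos + len, or offset + iov_length(), whenever ->write_begin, ->write_end, or the direct-IO path comes back short on a write.]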
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
@@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1273 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1274 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1275 sbi->dir_ops = fs_dir_inode_ops;
1276 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1277 DEFAULT_RATELIMIT_BURST);
1253 1278
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1279 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1280 if (error)
@@ -1497,10 +1522,8 @@ out_fail:
1497 iput(fat_inode); 1522 iput(fat_inode);
1498 if (root_inode) 1523 if (root_inode)
1499 iput(root_inode); 1524 iput(root_inode);
1500 if (sbi->nls_io) 1525 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1526 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1527 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1528 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1529 sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef50154868..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
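[Note on the hunk above: "%04X" sets a minimum field width, not a cap, so once i carries the full jiffies value it is the mask at format time plus snprintf() that bounds the output to exactly four digits within the shrunken 5-byte buffer; presumably keeping i unmasked preserves more of the jiffies value for the collision-retry arithmetic. A userspace-checkable sketch of the bounding:]

#include <stdio.h>

int main(void)
{
	char buf[5];			/* 4 hex digits + NUL */
	unsigned long i = 0x12345;	/* e.g. raw jiffies */

	snprintf(buf, sizeof(buf), "%04X", (unsigned)(i & 0xffff));
	printf("%s\n", buf);		/* "2345", never wider */
	return 0;
}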
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
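[Note: F_SETPIPE_SZ and F_GETPIPE_SZ are new fcntl commands that resize and query a pipe's ring, dispatched to pipe_fcntl() in fs/pipe.c. A userspace sketch of the interface as it settled (byte-based sizes, rounded up to a power-of-two number of pages; assumes fcntl.h exposes the two constants):]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
		perror("F_SETPIPE_SZ");
	printf("pipe buffer: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}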
@@ -614,9 +619,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 619 return ret;
615} 620}
616 621
617static DEFINE_RWLOCK(fasync_lock); 622static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 623static struct kmem_cache *fasync_cache __read_mostly;
619 624
625static void fasync_free_rcu(struct rcu_head *head)
626{
627 kmem_cache_free(fasync_cache,
628 container_of(head, struct fasync_struct, fa_rcu));
629}
630
620/* 631/*
621 * Remove a fasync entry. If successfully removed, return 632 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 633 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +636,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 636 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 637 * match the state "is the filp on a fasync list".
627 * 638 *
628 * We always take the 'filp->f_lock', in since fasync_lock
629 * needs to be irq-safe.
630 */ 639 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 640static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 641{
@@ -634,17 +643,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 643 int result = 0;
635 644
636 spin_lock(&filp->f_lock); 645 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 646 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 647 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 648 if (fa->fa_file != filp)
640 continue; 649 continue;
650
651 spin_lock_irq(&fa->fa_lock);
652 fa->fa_file = NULL;
653 spin_unlock_irq(&fa->fa_lock);
654
641 *fp = fa->fa_next; 655 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 656 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 657 filp->f_flags &= ~FASYNC;
644 result = 1; 658 result = 1;
645 break; 659 break;
646 } 660 }
647 write_unlock_irq(&fasync_lock); 661 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 662 spin_unlock(&filp->f_lock);
649 return result; 663 return result;
650} 664}
@@ -666,25 +680,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 680 return -ENOMEM;
667 681
668 spin_lock(&filp->f_lock); 682 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 683 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 684 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 685 if (fa->fa_file != filp)
672 continue; 686 continue;
687
688 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 689 fa->fa_fd = fd;
690 spin_unlock_irq(&fa->fa_lock);
691
674 kmem_cache_free(fasync_cache, new); 692 kmem_cache_free(fasync_cache, new);
675 goto out; 693 goto out;
676 } 694 }
677 695
696 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 697 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 698 new->fa_file = filp;
680 new->fa_fd = fd; 699 new->fa_fd = fd;
681 new->fa_next = *fapp; 700 new->fa_next = *fapp;
682 *fapp = new; 701 rcu_assign_pointer(*fapp, new);
683 result = 1; 702 result = 1;
684 filp->f_flags |= FASYNC; 703 filp->f_flags |= FASYNC;
685 704
686out: 705out:
687 write_unlock_irq(&fasync_lock); 706 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 707 spin_unlock(&filp->f_lock);
689 return result; 708 return result;
690} 709}
@@ -704,37 +723,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 723
705EXPORT_SYMBOL(fasync_helper); 724EXPORT_SYMBOL(fasync_helper);
706 725
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 726/*
727 * rcu_read_lock() is held
728 */
729static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 730{
709 while (fa) { 731 while (fa) {
710 struct fown_struct * fown; 732 struct fown_struct *fown;
711 if (fa->magic != FASYNC_MAGIC) { 733 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 734 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 735 "fasync_struct!\n");
714 return; 736 return;
715 } 737 }
716 fown = &fa->fa_file->f_owner; 738 spin_lock(&fa->fa_lock);
717 /* Don't send SIGURG to processes which have not set a 739 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 740 fown = &fa->fa_file->f_owner;
719 mechanism. */ 741 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 742 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 743 mechanism. */
722 fa = fa->fa_next; 744 if (!(sig == SIGURG && fown->signum == 0))
745 send_sigio(fown, fa->fa_fd, band);
746 }
747 spin_unlock(&fa->fa_lock);
748 fa = rcu_dereference(fa->fa_next);
723 } 749 }
724} 750}
725 751
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 752void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 753{
730 /* First a quick test without locking: usually 754 /* First a quick test without locking: usually
731 * the list is empty. 755 * the list is empty.
732 */ 756 */
733 if (*fp) { 757 if (*fp) {
734 read_lock(&fasync_lock); 758 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 759 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 760 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 761 }
739} 762}
740EXPORT_SYMBOL(kill_fasync); 763EXPORT_SYMBOL(kill_fasync);
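
The fcntl.c changes above turn fasync_lock into a plain spinlock for writers and let kill_fasync() walk the list under RCU, deferring frees with call_rcu() so readers never race a kmem_cache_free(). A minimal sketch of that pattern on a hypothetical node type (the names are illustrative, not the kernel's fasync code):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
	struct node *next;
	struct rcu_head rcu;
};

static DEFINE_SPINLOCK(list_lock);

static void node_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct node, rcu));
}

/* Writer: unlink under the spinlock, defer the free past all readers. */
static void node_remove(struct node **list, struct node *victim)
{
	struct node **pp;

	spin_lock(&list_lock);
	for (pp = list; *pp; pp = &(*pp)->next) {
		if (*pp == victim) {
			*pp = victim->next;
			call_rcu(&victim->rcu, node_free_rcu);
			break;
		}
	}
	spin_unlock(&list_lock);
}

/* Reader: lock-free traversal, valid within the rcu_read_lock() window. */
static void node_walk(struct node *list)
{
	struct node *p;

	rcu_read_lock();
	for (p = rcu_dereference(list); p; p = rcu_dereference(p->next))
		;	/* act on *p; it cannot be freed until we unlock */
	rcu_read_unlock();
}

The real patch additionally gives each entry a per-entry fa_lock guarding fa_fd and fa_file against concurrent senders; the sketch omits that for brevity.
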
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
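
With __fput() now static and defined just ahead of its only caller, the release path is the classic dec-and-test idiom. A hedged sketch on a hypothetical object (not the VFS code itself):

#include <asm/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_long_t count;
};

static void obj_release(struct obj *o)
{
	kfree(o);	/* runs exactly once, on the final put */
}

static void obj_put(struct obj *o)
{
	/* Only the thread that drops the count to zero sees "true",
	 * so the release path needs no further locking. */
	if (atomic_long_dec_and_test(&o->count))
		obj_release(o);
}
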
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -41,9 +42,10 @@ struct wb_writeback_args {
41 long nr_pages; 42 long nr_pages;
42 struct super_block *sb; 43 struct super_block *sb;
43 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
44 int for_kupdate:1; 45 unsigned int for_kupdate:1;
45 int range_cyclic:1; 46 unsigned int range_cyclic:1;
46 int for_background:1; 47 unsigned int for_background:1;
48 unsigned int sb_pinned:1;
47}; 49};
48 50
49/* 51/*
@@ -191,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
191} 193}
192 194
193static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 195static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
194 struct wb_writeback_args *args) 196 struct wb_writeback_args *args,
197 int wait)
195{ 198{
196 struct bdi_work *work; 199 struct bdi_work *work;
197 200
@@ -203,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
203 if (work) { 206 if (work) {
204 bdi_work_init(work, args); 207 bdi_work_init(work, args);
205 bdi_queue_work(bdi, work); 208 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
206 } else { 211 } else {
207 struct bdi_writeback *wb = &bdi->wb; 212 struct bdi_writeback *wb = &bdi->wb;
208 213
@@ -229,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
229 .sync_mode = WB_SYNC_ALL, 234 .sync_mode = WB_SYNC_ALL,
230 .nr_pages = LONG_MAX, 235 .nr_pages = LONG_MAX,
231 .range_cyclic = 0, 236 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
238 * let's make it explicitly clear.
240 */
241 .sb_pinned = 1,
232 }; 242 };
233 struct bdi_work work; 243 struct bdi_work work;
234 244
@@ -244,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
244 * @bdi: the backing device to write from 254 * @bdi: the backing device to write from
245 * @sb: write inodes from this super_block 255 * @sb: write inodes from this super_block
246 * @nr_pages: the number of pages to write 256 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
247 * 258 *
248 * Description: 259 * Description:
249 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
250 * started when this function returns, we make no guarantees on 261 * started when this function returns, we make no guarantees on
251 * completion. Caller need not hold sb s_umount semaphore. 262 * completion. Caller specifies whether sb umount sem is held already or not.
252 * 263 *
253 */ 264 */
254void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
255 long nr_pages) 266 long nr_pages, int sb_locked)
256{ 267{
257 struct wb_writeback_args args = { 268 struct wb_writeback_args args = {
258 .sb = sb, 269 .sb = sb,
259 .sync_mode = WB_SYNC_NONE, 270 .sync_mode = WB_SYNC_NONE,
260 .nr_pages = nr_pages, 271 .nr_pages = nr_pages,
261 .range_cyclic = 1, 272 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
262 }; 274 };
263 275
264 /* 276 /*
@@ -270,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
270 args.for_background = 1; 282 args.for_background = 1;
271 } 283 }
272 284
273 bdi_alloc_queue_work(bdi, &args); 285 bdi_alloc_queue_work(bdi, &args, sb_locked);
274} 286}
275 287
276/* 288/*
@@ -397,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
397 wait_queue_head_t *wqh; 409 wait_queue_head_t *wqh;
398 410
399 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 411 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
400 do { 412 while (inode->i_state & I_SYNC) {
401 spin_unlock(&inode_lock); 413 spin_unlock(&inode_lock);
402 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 414 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
403 spin_lock(&inode_lock); 415 spin_lock(&inode_lock);
404 } while (inode->i_state & I_SYNC); 416 }
405} 417}
406 418
407/* 419/*
@@ -451,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
451 463
452 BUG_ON(inode->i_state & I_SYNC); 464 BUG_ON(inode->i_state & I_SYNC);
453 465
454 /* Set I_SYNC, reset I_DIRTY */ 466 /* Set I_SYNC, reset I_DIRTY_PAGES */
455 dirty = inode->i_state & I_DIRTY;
456 inode->i_state |= I_SYNC; 467 inode->i_state |= I_SYNC;
457 inode->i_state &= ~I_DIRTY; 468 inode->i_state &= ~I_DIRTY_PAGES;
458
459 spin_unlock(&inode_lock); 469 spin_unlock(&inode_lock);
460 470
461 ret = do_writepages(mapping, wbc); 471 ret = do_writepages(mapping, wbc);
@@ -471,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
471 ret = err; 481 ret = err;
472 } 482 }
473 483
484 /*
485 * Some filesystems may redirty the inode during the writeback
486 * due to delalloc; clear dirty metadata flags right before
487 * write_inode()
488 */
489 spin_lock(&inode_lock);
490 dirty = inode->i_state & I_DIRTY;
491 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
492 spin_unlock(&inode_lock);
474 /* Don't write the inode if only I_DIRTY_PAGES was set */ 493 /* Don't write the inode if only I_DIRTY_PAGES was set */
475 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 494 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
476 int err = write_inode(inode, wbc); 495 int err = write_inode(inode, wbc);
@@ -553,108 +572,85 @@ select_queue:
553 return ret; 572 return ret;
554} 573}
555 574
556static void unpin_sb_for_writeback(struct super_block **psb) 575static void unpin_sb_for_writeback(struct super_block *sb)
557{ 576{
558 struct super_block *sb = *psb; 577 up_read(&sb->s_umount);
559 578 put_super(sb);
560 if (sb) {
561 up_read(&sb->s_umount);
562 put_super(sb);
563 *psb = NULL;
564 }
565} 579}
566 580
581enum sb_pin_state {
582 SB_PINNED,
583 SB_NOT_PINNED,
584 SB_PIN_FAILED
585};
586
567/* 587/*
568 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 588 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
569 * before calling writeback. So make sure that we do pin it, so it doesn't 589 * before calling writeback. So make sure that we do pin it, so it doesn't
570 * go away while we are writing inodes from it. 590 * go away while we are writing inodes from it.
571 *
572 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
573 * 1 if we failed.
574 */ 591 */
575static int pin_sb_for_writeback(struct writeback_control *wbc, 592static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
576 struct inode *inode, struct super_block **psb) 593 struct super_block *sb)
577{ 594{
578 struct super_block *sb = inode->i_sb;
579
580 /*
581 * If this sb is already pinned, nothing more to do. If not and
582 * *psb is non-NULL, unpin the old one first
583 */
584 if (sb == *psb)
585 return 0;
586 else if (*psb)
587 unpin_sb_for_writeback(psb);
588
589 /* 595 /*
590 * Caller must already hold the ref for this 596 * Caller must already hold the ref for this
591 */ 597 */
592 if (wbc->sync_mode == WB_SYNC_ALL) { 598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
593 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 599 WARN_ON(!rwsem_is_locked(&sb->s_umount));
594 return 0; 600 return SB_NOT_PINNED;
595 } 601 }
596
597 spin_lock(&sb_lock); 602 spin_lock(&sb_lock);
598 sb->s_count++; 603 sb->s_count++;
599 if (down_read_trylock(&sb->s_umount)) { 604 if (down_read_trylock(&sb->s_umount)) {
600 if (sb->s_root) { 605 if (sb->s_root) {
601 spin_unlock(&sb_lock); 606 spin_unlock(&sb_lock);
602 goto pinned; 607 return SB_PINNED;
603 } 608 }
604 /* 609 /*
605 * umounted, drop rwsem again and fall through to failure 610 * umounted, drop rwsem again and fall through to failure
606 */ 611 */
607 up_read(&sb->s_umount); 612 up_read(&sb->s_umount);
608 } 613 }
609
610 sb->s_count--; 614 sb->s_count--;
611 spin_unlock(&sb_lock); 615 spin_unlock(&sb_lock);
612 return 1; 616 return SB_PIN_FAILED;
613pinned:
614 *psb = sb;
615 return 0;
616} 617}
617 618
618static void writeback_inodes_wb(struct bdi_writeback *wb, 619/*
619 struct writeback_control *wbc) 620 * Write a portion of b_io inodes which belong to @sb.
621 * If @wbc->sb != NULL, then find and write all such
622 * inodes. Otherwise write only ones which go sequentially
623 * in reverse order.
624 * Return 1 if the caller's writeback routine should be
625 * interrupted. Otherwise return 0.
626 */
627static int writeback_sb_inodes(struct super_block *sb,
628 struct bdi_writeback *wb,
629 struct writeback_control *wbc)
620{ 630{
621 struct super_block *sb = wbc->sb, *pin_sb = NULL;
622 const unsigned long start = jiffies; /* livelock avoidance */
623
624 spin_lock(&inode_lock);
625
626 if (!wbc->for_kupdate || list_empty(&wb->b_io))
627 queue_io(wb, wbc->older_than_this);
628
629 while (!list_empty(&wb->b_io)) { 631 while (!list_empty(&wb->b_io)) {
630 struct inode *inode = list_entry(wb->b_io.prev,
631 struct inode, i_list);
632 long pages_skipped; 632 long pages_skipped;
633 633 struct inode *inode = list_entry(wb->b_io.prev,
634 /* 634 struct inode, i_list);
635 * super block given and doesn't match, skip this inode 635 if (wbc->sb && sb != inode->i_sb) {
636 */ 636 /* super block given and doesn't
637 if (sb && sb != inode->i_sb) { 637 match, skip this inode */
638 redirty_tail(inode); 638 redirty_tail(inode);
639 continue; 639 continue;
640 } 640 }
641 641 if (sb != inode->i_sb)
642 /* finish with this superblock */
643 return 0;
642 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 644 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
643 requeue_io(inode); 645 requeue_io(inode);
644 continue; 646 continue;
645 } 647 }
646
647 /* 648 /*
648 * Was this inode dirtied after sync_sb_inodes was called? 649 * Was this inode dirtied after sync_sb_inodes was called?
649 * This keeps sync from extra jobs and livelock. 650 * This keeps sync from extra jobs and livelock.
650 */ 651 */
651 if (inode_dirtied_after(inode, start)) 652 if (inode_dirtied_after(inode, wbc->wb_start))
652 break; 653 return 1;
653
654 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
655 requeue_io(inode);
656 continue;
657 }
658 654
659 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 655 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
660 __iget(inode); 656 __iget(inode);
@@ -673,14 +669,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
673 spin_lock(&inode_lock); 669 spin_lock(&inode_lock);
674 if (wbc->nr_to_write <= 0) { 670 if (wbc->nr_to_write <= 0) {
675 wbc->more_io = 1; 671 wbc->more_io = 1;
676 break; 672 return 1;
677 } 673 }
678 if (!list_empty(&wb->b_more_io)) 674 if (!list_empty(&wb->b_more_io))
679 wbc->more_io = 1; 675 wbc->more_io = 1;
680 } 676 }
677 /* b_io is empty */
678 return 1;
679}
680
681static void writeback_inodes_wb(struct bdi_writeback *wb,
682 struct writeback_control *wbc)
683{
684 int ret = 0;
685
686 wbc->wb_start = jiffies; /* livelock avoidance */
687 spin_lock(&inode_lock);
688 if (!wbc->for_kupdate || list_empty(&wb->b_io))
689 queue_io(wb, wbc->older_than_this);
681 690
682 unpin_sb_for_writeback(&pin_sb); 691 while (!list_empty(&wb->b_io)) {
692 struct inode *inode = list_entry(wb->b_io.prev,
693 struct inode, i_list);
694 struct super_block *sb = inode->i_sb;
695 enum sb_pin_state state;
683 696
697 if (wbc->sb && sb != wbc->sb) {
698 /* super block given and doesn't
699 match, skip this inode */
700 redirty_tail(inode);
701 continue;
702 }
703 state = pin_sb_for_writeback(wbc, sb);
704
705 if (state == SB_PIN_FAILED) {
706 requeue_io(inode);
707 continue;
708 }
709 ret = writeback_sb_inodes(sb, wb, wbc);
710
711 if (state == SB_PINNED)
712 unpin_sb_for_writeback(sb);
713 if (ret)
714 break;
715 }
684 spin_unlock(&inode_lock); 716 spin_unlock(&inode_lock);
685 /* Leave any unwritten inodes on b_io */ 717 /* Leave any unwritten inodes on b_io */
686} 718}
@@ -737,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
737 .for_kupdate = args->for_kupdate, 769 .for_kupdate = args->for_kupdate,
738 .for_background = args->for_background, 770 .for_background = args->for_background,
739 .range_cyclic = args->range_cyclic, 771 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
740 }; 773 };
741 unsigned long oldest_jif; 774 unsigned long oldest_jif;
742 long wrote = 0; 775 long wrote = 0;
@@ -838,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
838 unsigned long expired; 871 unsigned long expired;
839 long nr_pages; 872 long nr_pages;
840 873
874 /*
875 * When set to zero, disable periodic writeback
876 */
877 if (!dirty_writeback_interval)
878 return 0;
879
841 expired = wb->last_old_flush + 880 expired = wb->last_old_flush +
842 msecs_to_jiffies(dirty_writeback_interval * 10); 881 msecs_to_jiffies(dirty_writeback_interval * 10);
843 if (time_before(jiffies, expired)) 882 if (time_before(jiffies, expired))
@@ -873,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
873 912
874 while ((work = get_next_work_item(bdi, wb)) != NULL) { 913 while ((work = get_next_work_item(bdi, wb)) != NULL) {
875 struct wb_writeback_args args = work->args; 914 struct wb_writeback_args args = work->args;
915 int post_clear;
876 916
877 /* 917 /*
878 * Override sync mode, in case we must wait for completion 918 * Override sync mode, in case we must wait for completion
@@ -880,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
880 if (force_wait) 920 if (force_wait)
881 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
882 922
923 post_clear = args.sync_mode == WB_SYNC_ALL || args.sb_pinned;
924
883 /* 925 /*
884 * If this isn't a data integrity operation, just notify 926 * If this isn't a data integrity operation, just notify
885 * that we have seen this work and we are now starting it. 927 * that we have seen this work and we are now starting it.
886 */ 928 */
887 if (args.sync_mode == WB_SYNC_NONE) 929 if (!post_clear)
888 wb_clear_pending(wb, work); 930 wb_clear_pending(wb, work);
889 931
890 wrote += wb_writeback(wb, &args); 932 wrote += wb_writeback(wb, &args);
@@ -893,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
893 * This is a data integrity writeback, so only do the 935 * This is a data integrity writeback, so only do the
894 * notification when we have completed the work. 936 * notification when we have completed the work.
895 */ 937 */
896 if (args.sync_mode == WB_SYNC_ALL) 938 if (post_clear)
897 wb_clear_pending(wb, work); 939 wb_clear_pending(wb, work);
898 } 940 }
899 941
@@ -933,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
933 break; 975 break;
934 } 976 }
935 977
936 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 978 if (dirty_writeback_interval) {
937 schedule_timeout_interruptible(wait_jiffies); 979 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
980 schedule_timeout_interruptible(wait_jiffies);
981 } else {
982 set_current_state(TASK_INTERRUPTIBLE);
983 if (list_empty_careful(&wb->bdi->work_list) &&
984 !kthread_should_stop())
985 schedule();
986 __set_current_state(TASK_RUNNING);
987 }
988
938 try_to_freeze(); 989 try_to_freeze();
939 } 990 }
940 991
@@ -960,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
960 if (!bdi_has_dirty_io(bdi)) 1011 if (!bdi_has_dirty_io(bdi))
961 continue; 1012 continue;
962 1013
963 bdi_alloc_queue_work(bdi, &args); 1014 bdi_alloc_queue_work(bdi, &args, 0);
964 } 1015 }
965 1016
966 rcu_read_unlock(); 1017 rcu_read_unlock();
@@ -1169,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
1169 iput(old_inode); 1220 iput(old_inode);
1170} 1221}
1171 1222
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1172/** 1235/**
1173 * writeback_inodes_sb - writeback dirty inodes from given super_block 1236 * writeback_inodes_sb - writeback dirty inodes from given super_block
1174 * @sb: the superblock 1237 * @sb: the superblock
@@ -1180,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
1180 */ 1243 */
1181void writeback_inodes_sb(struct super_block *sb) 1244void writeback_inodes_sb(struct super_block *sb)
1182{ 1245{
1183 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1246 __writeback_inodes_sb(sb, 0);
1184 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1185 long nr_to_write;
1186
1187 nr_to_write = nr_dirty + nr_unstable +
1188 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1189
1190 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1191} 1247}
1192EXPORT_SYMBOL(writeback_inodes_sb); 1248EXPORT_SYMBOL(writeback_inodes_sb);
1193 1249
1194/** 1250/**
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
1252 * @sb: the superblock
1253 *
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260}
1261
1262/**
1195 * writeback_inodes_sb_if_idle - start writeback if none underway 1263 * writeback_inodes_sb_if_idle - start writeback if none underway
1196 * @sb: the superblock 1264 * @sb: the superblock
1197 * 1265 *
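
writeback_inodes_sb_locked() exists for callers that already hold s_umount; the sb_pinned flag it sets lets pin_sb_for_writeback() skip re-taking the sem. A sketch of a hypothetical caller, assuming it takes the sem for reading first (examplefs is illustrative, not part of the patch):

#include <linux/fs.h>
#include <linux/writeback.h>

static void examplefs_flush(struct super_block *sb)
{
	down_read(&sb->s_umount);
	if (sb->s_root)
		writeback_inodes_sb_locked(sb);	/* sb already pinned */
	up_read(&sb->s_umount);
}

Calling plain writeback_inodes_sb() here could deadlock on the umount sem, which is the case this patch addresses.
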
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
@@ -102,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
102 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
103 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
104 if (pos == 0) 105 if (pos == 0)
105 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
106 if (pos < 3) 107 if (pos < 3)
107 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
108 109
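
The object-list fix quiets a 64-bit-int-to-pointer warning: ++(*_pos) yields a loff_t, and converting it to a pointer must pass through a pointer-sized integer first. The idiom in isolation (helper name is hypothetical):

#include <linux/types.h>

static inline void *token_from_pos(loff_t pos)
{
	/* Truncate through a pointer-sized integer to avoid gcc's
	 * "cast to pointer from integer of different size" warning. */
	return (void *)(unsigned long)pos;
}
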
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
165 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
166 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
167 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out), 168 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_lookups_timed_out));
170 170
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 172 atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -498,6 +502,9 @@ struct fuse_copy_state {
498 int write; 502 int write;
499 struct fuse_req *req; 503 struct fuse_req *req;
500 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
501 unsigned long nr_segs; 508 unsigned long nr_segs;
502 unsigned long seglen; 509 unsigned long seglen;
503 unsigned long addr; 510 unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
505 void *mapaddr; 512 void *mapaddr;
506 void *buf; 513 void *buf;
507 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
508}; 516};
509 517
510static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
511 int write, struct fuse_req *req, 519 int write,
512 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
513{ 521{
514 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
515 cs->fc = fc; 523 cs->fc = fc;
516 cs->write = write; 524 cs->write = write;
517 cs->req = req;
518 cs->iov = iov; 525 cs->iov = iov;
519 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
520} 527}
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
522/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
523static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
524{ 531{
525 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
526 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
527 if (cs->write) { 545 if (cs->write) {
528 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
544 562
545 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
546 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
547 if (!cs->seglen) { 565 if (cs->pipebufs) {
548 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
549 cs->seglen = cs->iov[0].iov_len; 567
550 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
551 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
552 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
553 } 619 }
554 down_read(&current->mm->mmap_sem);
555 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
556 &cs->pg, NULL);
557 up_read(&current->mm->mmap_sem);
558 if (err < 0)
559 return err;
560 BUG_ON(err != 1);
561 offset = cs->addr % PAGE_SIZE;
562 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
563 cs->buf = cs->mapaddr + offset;
564 cs->len = min(PAGE_SIZE - offset, cs->seglen);
565 cs->seglen -= cs->len;
566 cs->addr += cs->len;
567 620
568 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
569} 622}
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
585 return ncpy; 638 return ncpy;
586} 639}
587 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
588/* 784/*
589 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
590 * done atomically 786 * done atomically
591 */ 787 */
592static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
593 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
594{ 790{
791 int err;
792 struct page *page = *pagep;
793
595 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
597 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
598 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
599 } 798 }
600 while (count) { 799 while (count) {
601 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
602 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
603 if (err) 802 } else if (!cs->len) {
604 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
605 } 813 }
606 if (page) { 814 if (page) {
607 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
626 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
627 835
628 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
629 struct page *page = req->pages[i]; 837 int err;
630 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
631 if (err) 841 if (err)
632 return err; 842 return err;
633 843
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
704 * 914 *
705 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
706 */ 916 */
707static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
708 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
709__releases(&fc->lock) 919__releases(&fc->lock)
710{ 920{
711 struct fuse_copy_state cs;
712 struct fuse_in_header ih; 921 struct fuse_in_header ih;
713 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
714 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
724 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
725 934
726 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
727 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
728 return -EINVAL; 937 return -EINVAL;
729 938
730 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
731 err = fuse_copy_one(&cs, &ih, sizeof(ih));
732 if (!err) 940 if (!err)
733 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
734 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
735 943
736 return err ? err : reqsize; 944 return err ? err : reqsize;
737} 945}
@@ -745,18 +953,13 @@ __releases(&fc->lock)
745 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
746 * the 'sent' flag. 954 * the 'sent' flag.
747 */ 955 */
748static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
749 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
750{ 958{
751 int err; 959 int err;
752 struct fuse_req *req; 960 struct fuse_req *req;
753 struct fuse_in *in; 961 struct fuse_in *in;
754 struct fuse_copy_state cs;
755 unsigned reqsize; 962 unsigned reqsize;
756 struct file *file = iocb->ki_filp;
757 struct fuse_conn *fc = fuse_get_conn(file);
758 if (!fc)
759 return -EPERM;
760 963
761 restart: 964 restart:
762 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
776 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
777 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
778 intr_entry); 981 intr_entry);
779 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
780 } 983 }
781 984
782 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
786 in = &req->in; 989 in = &req->in;
787 reqsize = in->h.len; 990 reqsize = in->h.len;
788 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
789 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
790 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
791 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
792 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
795 goto restart; 998 goto restart;
796 } 999 }
797 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
798 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
799 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
800 if (!err) 1003 if (!err)
801 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
802 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
803 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
804 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
805 req->locked = 0; 1008 req->locked = 0;
806 if (req->aborted) { 1009 if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
828 return err; 1031 return err;
829} 1032}
830 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
831static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
832 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
833{ 1140{
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
987 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
988 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
989 */ 1296 */
990static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
991 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
992{ 1299{
993 int err; 1300 int err;
994 size_t nbytes = iov_length(iov, nr_segs);
995 struct fuse_req *req; 1301 struct fuse_req *req;
996 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
997 struct fuse_copy_state cs;
998 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
999 if (!fc)
1000 return -EPERM;
1001 1303
1002 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1003 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1004 return -EINVAL; 1305 return -EINVAL;
1005 1306
1006 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1007 if (err) 1308 if (err)
1008 goto err_finish; 1309 goto err_finish;
1009 1310
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1016 * and error contains notification code. 1317 * and error contains notification code.
1017 */ 1318 */
1018 if (!oh.unique) { 1319 if (!oh.unique) {
1019 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1020 return err ? err : nbytes; 1321 return err ? err : nbytes;
1021 } 1322 }
1022 1323
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1035 1336
1036 if (req->aborted) { 1337 if (req->aborted) {
1037 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1038 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1039 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1040 request_end(fc, req); 1341 request_end(fc, req);
1041 return -ENOENT; 1342 return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1052 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1053 1354
1054 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1055 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1056 return nbytes; 1357 return nbytes;
1057 } 1358 }
1058 1359
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1060 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1061 req->out.h = oh; 1362 req->out.h = oh;
1062 req->locked = 1; 1363 req->locked = 1;
1063 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1064 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1065 1368
1066 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1067 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1068 1371
1069 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1070 req->locked = 0; 1373 req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1080 err_unlock: 1383 err_unlock:
1081 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1082 err_finish: 1385 err_finish:
1083 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1084 return err; 1387 return err;
1085} 1388}
1086 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1087static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1088{ 1482{
1089 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1225 .llseek = no_llseek, 1619 .llseek = no_llseek,
1226 .read = do_sync_read, 1620 .read = do_sync_read,
1227 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1228 .write = do_sync_write, 1623 .write = do_sync_write,
1229 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1230 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1231 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1232 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
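
With splice_read and splice_write wired up, a FUSE server can shuttle raw requests and replies through a pipe without copying them through a userspace buffer. A hypothetical userspace fragment (the fd names are assumptions, not part of the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* 'fuse_fd' is assumed to be an open, mounted /dev/fuse descriptor
 * and 'pipe_wr' the write end of a pipe. */
static ssize_t pull_request(int fuse_fd, int pipe_wr, size_t max)
{
	/* SPLICE_F_MOVE lets the kernel hand over page references
	 * instead of copying the request payload. */
	return splice(fuse_fd, NULL, pipe_wr, NULL, max, SPLICE_F_MOVE);
}

On the reply side, splicing from a pipe back into /dev/fuse with SPLICE_F_MOVE is what triggers the page-stealing path (fuse_try_move_page) shown above.
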
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
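
This hunk adapts fuse to the VFS ->fsync() prototype that drops the dentry argument; the inode now comes from file->f_mapping->host. A sketch of the shape a filesystem's method takes after the change (the examplefs names are hypothetical):

#include <linux/fs.h>

/* Stand-in for a real metadata writeback helper. */
static int examplefs_sync_metadata(struct inode *inode)
{
	return 0;
}

static int examplefs_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err = 0;

	/* datasync means callers only need the data on stable storage;
	 * pure metadata updates may be skipped. */
	if (!datasync)
		err = examplefs_sync_metadata(inode);
	return err;
}
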
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
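The pinning change above is a mechanical substitution; a hedged summary of how the arguments map (signatures as of this kernel generation):

/*
 *   down_read(&current->mm->mmap_sem);
 *   get_user_pages(current, current->mm, start, nr, write, 0, pages, NULL);
 *   up_read(&current->mm->mmap_sem);
 *
 * collapses to:
 *
 *   get_user_pages_fast(start, nr, write, pages);
 *
 * tsk/mm default to current, force is 0, the vmas argument is gone,
 * and mmap_sem is only taken internally if the lockless fast path
 * cannot pin all of the pages.
 */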
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number of arguments */ 183

181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
12#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
@@ -200,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
200 return -EAGAIN; 201 return -EAGAIN;
201} 202}
202 203
203struct xattr_handler generic_acl_access_handler = { 204const struct xattr_handler generic_acl_access_handler = {
204 .prefix = POSIX_ACL_XATTR_ACCESS, 205 .prefix = POSIX_ACL_XATTR_ACCESS,
205 .flags = ACL_TYPE_ACCESS, 206 .flags = ACL_TYPE_ACCESS,
206 .list = generic_acl_list, 207 .list = generic_acl_list,
@@ -208,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
208 .set = generic_acl_set, 209 .set = generic_acl_set,
209}; 210};
210 211
211struct xattr_handler generic_acl_default_handler = { 212const struct xattr_handler generic_acl_default_handler = {
212 .prefix = POSIX_ACL_XATTR_DEFAULT, 213 .prefix = POSIX_ACL_XATTR_DEFAULT,
213 .flags = ACL_TYPE_DEFAULT, 214 .flags = ACL_TYPE_DEFAULT,
214 .list = generic_acl_list, 215 .list = generic_acl_list,
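The const-ification here (and in the gfs2 hunks below) lets xattr handler tables live in read-only data. A hedged sketch of the registration pattern, with my_xattr_handlers invented for illustration:

static const struct xattr_handler *my_xattr_handlers[] = {
	&generic_acl_access_handler,
	&generic_acl_default_handler,
	NULL,	/* terminator */
};

/* at mount time: sb->s_xattr = my_xattr_handlers; */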
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype) 236 void *buffer, size_t size, int xtype)
237{ 237{
238 struct inode *inode = dentry->d_inode; 238 struct inode *inode = dentry->d_inode;
239 struct gfs2_sbd *sdp = GFS2_SB(inode);
239 struct posix_acl *acl; 240 struct posix_acl *acl;
240 int type; 241 int type;
241 int error; 242 int error;
242 243
244 if (!sdp->sd_args.ar_posix_acl)
245 return -EOPNOTSUPP;
246
243 type = gfs2_acl_type(name); 247 type = gfs2_acl_type(name);
244 if (type < 0) 248 if (type < 0)
245 return type; 249 return type;
@@ -335,7 +339,7 @@ out:
335 return error; 339 return error;
336} 340}
337 341
338struct xattr_handler gfs2_xattr_system_handler = { 342const struct xattr_handler gfs2_xattr_system_handler = {
339 .prefix = XATTR_SYSTEM_PREFIX, 343 .prefix = XATTR_SYSTEM_PREFIX,
340 .flags = GFS2_EATYPE_SYS, 344 .flags = GFS2_EATYPE_SYS,
341 .get = gfs2_xattr_system_get, 345 .get = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
23 23
24#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) 418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
419{ 419{
420 struct buffer_head *dibh; 420 struct buffer_head *dibh;
421 u64 dsize = i_size_read(&ip->i_inode);
421 void *kaddr; 422 void *kaddr;
422 int error; 423 int error;
423 424
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
437 return error; 438 return error;
438 439
439 kaddr = kmap_atomic(page, KM_USER0); 440 kaddr = kmap_atomic(page, KM_USER0);
440 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 441 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
441 ip->i_disksize); 442 dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
442 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); 443 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
444 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
443 kunmap_atomic(kaddr, KM_USER0); 445 kunmap_atomic(kaddr, KM_USER0);
444 flush_dcache_page(page); 446 flush_dcache_page(page);
445 brelse(dibh); 447 brelse(dibh);
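A worked example of the clamp introduced above, assuming a 4096-byte block and the usual 232-byte on-disk dinode header (sizes illustrative): a stuffed inode can carry at most 4096 - 232 = 3864 bytes of data, so an i_size beyond that (e.g. from a corrupt dinode) must be clipped to dibh->b_size - sizeof(struct gfs2_dinode) before the memcpy, or the copy would read past the end of the buffer.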
@@ -698,8 +700,14 @@ out:
698 return 0; 700 return 0;
699 701
700 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
701 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
702 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
703out_endtrans: 711out_endtrans:
704 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
705out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -72,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
72 71
73 if (!PageUptodate(page)) { 72 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 73 void *kaddr = kmap(page);
74 u64 dsize = i_size_read(inode);
75
76 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
77 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
75 78
76 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 79 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
77 ip->i_disksize); 80 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
78 memset(kaddr + ip->i_disksize, 0,
79 PAGE_CACHE_SIZE - ip->i_disksize);
80 kunmap(page); 81 kunmap(page);
81 82
82 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -1039,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 goto out; 1040 goto out;
1040 1041
1041 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1042 ip->i_disksize = size; 1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1043 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1044 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1044 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1045 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1045 gfs2_dinode_out(ip, dibh->b_data); 1046 gfs2_dinode_out(ip, dibh->b_data);
1046 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1047 if (dsize > dibh->b_size)
1048 dsize = dibh->b_size;
1049 gfs2_buffer_clear_tail(dibh, dsize);
1047 error = 1; 1050 error = 1;
1048
1049 } else { 1051 } else {
1050 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1052 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1051 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1053 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1475 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1476 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1477 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1478 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1479 brelse(bh);
1480 return inode; 1480 return inode;
1481 } 1481 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -169,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
169 if (error) 168 if (error)
170 goto fail; 169 goto fail;
171 170
172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
173 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
174 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
175 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 554 * Returns: errno
548 */ 555 */
549 556
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
551{ 558{
552 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 561 int ret = 0;
555 562
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 855 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 856 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 857 gh->gh_ip = (unsigned long)__builtin_return_address(0);
858 if (gh->gh_owner_pid)
859 put_pid(gh->gh_owner_pid);
860 gh->gh_owner_pid = get_pid(task_pid(current));
858} 861}
859 862
860/** 863/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
439struct gfs2_tune { 439struct gfs2_tune {
440 spinlock_t gt_spin; 440 spinlock_t gt_spin;
441 441
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 442 unsigned int gt_logd_secs;
446 443
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 444 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
462 SDF_SHUTDOWN = 2, 459 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 460 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 461 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -234,11 +230,102 @@ fail_glock:
234fail_iopen: 230fail_iopen:
235 gfs2_glock_put(io_gl); 231 gfs2_glock_put(io_gl);
236fail_put: 232fail_put:
233 if (inode->i_state & I_NEW)
234 ip->i_gl->gl_object = NULL;
235 gfs2_glock_put(ip->i_gl);
236fail:
237 if (inode->i_state & I_NEW)
238 iget_failed(inode);
239 else
240 iput(inode);
241 return ERR_PTR(error);
242}
243
244/**
245 * gfs2_process_unlinked_inode - Look up an unlinked inode and try to
246 * reclaim it by doing iput.
247 *
248 * This function assumes no rgrp locks are currently held.
249 *
250 * @sb: The super block
251 * @no_addr: The inode number
252 *
253 */
254
255void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
256{
257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl;
260 int error;
261 struct gfs2_holder gh;
262 struct inode *inode;
263
264 inode = gfs2_iget_skip(sb, no_addr);
265
266 if (!inode)
267 return;
268
269 /* If it's not a new inode, someone's using it, so leave it alone. */
270 if (!(inode->i_state & I_NEW)) {
271 iput(inode);
272 return;
273 }
274
275 ip = GFS2_I(inode);
276 sdp = GFS2_SB(inode);
277 ip->i_no_formal_ino = -1;
278
279 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
280 if (unlikely(error))
281 goto fail;
282 ip->i_gl->gl_object = ip;
283
284 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
285 if (unlikely(error))
286 goto fail_put;
287
288 set_bit(GIF_INVALID, &ip->i_flags);
289 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
290 &ip->i_iopen_gh);
291 if (unlikely(error))
292 goto fail_iopen;
293
294 ip->i_iopen_gh.gh_gl->gl_object = ip;
295 gfs2_glock_put(io_gl);
296
297 inode->i_mode = DT2IF(DT_UNKNOWN);
298
299 /*
300 * We must read the inode in order to work out its type in
301 * this case. Note that this doesn't happen often as we normally
302 * know the type beforehand. This code path only occurs during
303 * unlinked inode recovery (where it is safe to take this glock,
304 * which is not true in the general case).
305 */
306 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
307 &gh);
308 if (unlikely(error))
309 goto fail_glock;
310
311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(inode);
314
315 /* The iput will cause it to be deleted. */
316 iput(inode);
317 return;
318
319fail_glock:
320 gfs2_glock_dq(&ip->i_iopen_gh);
321fail_iopen:
322 gfs2_glock_put(io_gl);
323fail_put:
237 ip->i_gl->gl_object = NULL; 324 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 325 gfs2_glock_put(ip->i_gl);
239fail: 326fail:
240 iget_failed(inode); 327 iget_failed(inode);
241 return ERR_PTR(error); 328 return;
242} 329}
243 330
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 331static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -862,7 +949,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 949 goto fail_gunlock2;
863 950
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 951 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 952 inum.no_formal_ino);
866 if (IS_ERR(inode)) 953 if (IS_ERR(inode))
867 goto fail_gunlock2; 954 goto fail_gunlock2;
868 955
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 89
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
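The reservation scheme above replaces a mutex-plus-flush loop with a lock-free grab and an exclusive wait. A hedged userspace analogue of the same shape, with all names invented; a real logd counterpart would update free_blocks and then signal q_cond under q_mtx whenever it frees journal space:

#include <pthread.h>
#include <stdatomic.h>

static atomic_uint free_blocks;                 /* ~sd_log_blks_free */
static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;

static void log_reserve(unsigned blks, unsigned reserved_blks)
{
	unsigned wanted = blks + reserved_blks;
	unsigned cur;
	int did_wait = 0;

retry:
	cur = atomic_load(&free_blocks);
	if (cur <= wanted) {
		/* Re-check the predicate under the mutex so a signal
		 * cannot slip in between the test and the wait. */
		pthread_mutex_lock(&q_mtx);
		while (atomic_load(&free_blocks) <= wanted) {
			did_wait = 1;
			pthread_cond_wait(&q_cond, &q_mtx);
		}
		pthread_mutex_unlock(&q_mtx);
		cur = atomic_load(&free_blocks);
	}
	/* Take our share without a lock; start over if we raced. */
	if (!atomic_compare_exchange_strong(&free_blocks, &cur, cur - blks))
		goto retry;
	/* We may have consumed a wakeup meant for the next waiter;
	 * pass it on, mirroring the wake_up() after the wait above. */
	if (did_wait)
		pthread_cond_signal(&q_cond);
}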
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
710 * 696 *
711 */ 697 */
712 698
713void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
714{ 700{
715 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
716 702
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
822 * @sdp: the filesystem 808 * @sdp: the filesystem
823 * @tr: the transaction 809 * @tr: the transaction
824 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is set to 2/5ths of the journal size and
816 * thresh2 to 4/5ths of it (see init_journal()).
817 *
825 * Returns: errno 818 * Returns: errno
826 */ 819 */
827 820
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
832 825
833 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
834 827
835 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
836 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
837 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
838 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
839} 832}
840 833
841/** 834/**
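To make the wakeup thresholds concrete (journal size invented for the example): with jd_blocks = 32768, init_journal() sets thresh1 = 2*32768/5 = 13107 and thresh2 = 4*32768/5 = 26214, so logd is woken once pinned blocks reach about 40% of the journal, or once the blocks in use (jd_blocks minus sd_log_blks_free) reach about 80%.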
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
882{ 875{
883 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
884 for (;;) { 877 for (;;) {
885 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
886 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
887 break; 880 break;
888 msleep(10); 881 msleep(10);
889 } 882 }
890} 883}
891 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
892 895
893/** 896/**
894 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
901int gfs2_logd(void *data) 904int gfs2_logd(void *data)
902{ 905{
903 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
904 unsigned long t; 907 unsigned long t = 1;
905 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
906 910
907 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
908 /* Advance the log tail */
909 912
910 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
911 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
912 919
913 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
914 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
915 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
916 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
917 if (need_flush || time_after_eq(jiffies, t)) {
918 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
919 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
920 } 926 }
921 927
928 wake_up(&sdp->sd_log_waitq);
922 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
923 if (freezing(current)) 930 if (freezing(current))
924 refrigerator(); 931 refrigerator();
925 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_UNINTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
926 } 944 }
927 945
928 return 0; 946 return 0;
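The loop above is a timed, compound-condition variant of the canonical open-coded kernel wait idiom; for reference, the basic shape is:

DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
	if (condition)
		break;
	schedule();
}
finish_wait(&wq, &wait);

gfs2_logd folds its condition checks into schedule_timeout() so that it also wakes periodically to honour gt_logd_secs.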
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 58 struct buffer_head *real);
60void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
61 62
62static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
63{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
64 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
65 __gfs2_log_flush(sbd, gl);
66}
67
68void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
69void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
70
71void gfs2_log_shutdown(struct gfs2_sbd *sdp);
72void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
73int gfs2_logd(void *data);
74 66
75#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 57{
58 spin_lock_init(&gt->gt_spin); 58 spin_lock_init(&gt->gt_spin);
59 59
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 60 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 61 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 62 gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 99 spin_lock_init(&sdp->sd_trunc_lock);
102 100
103 spin_lock_init(&sdp->sd_log_lock); 101 spin_lock_init(&sdp->sd_log_lock);
104 102 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 103 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 104 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 105 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 106 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 107 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 108
111 mutex_init(&sdp->sd_log_reserve_mutex); 109 init_waitqueue_head(&sdp->sd_log_waitq);
110 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 111 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 112 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 113
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 486 struct dentry *dentry;
488 struct inode *inode; 487 struct inode *inode;
489 488
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 489 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 490 if (IS_ERR(inode)) {
492 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 491 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
493 return PTR_ERR(inode); 492 return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
733 if (sdp->sd_args.ar_spectator) { 732 if (sdp->sd_args.ar_spectator) {
734 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 733 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
735 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 734 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
735 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
736 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
736 } else { 737 } else {
737 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 738 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
738 fs_err(sdp, "can't mount journal #%u\n", 739 fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
770 goto fail_jinode_gh; 771 goto fail_jinode_gh;
771 } 772 }
772 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 773 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
774 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
775 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
773 776
774 /* Map the extents for this journal's blocks */ 777 /* Map the extents for this journal's blocks */
775 map_journal_extents(sdp); 778 map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
951 if (undo) 954 if (undo)
952 goto fail_quotad; 955 goto fail_quotad;
953 956
954 sdp->sd_log_flush_time = jiffies;
955
956 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 957 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
957 error = IS_ERR(p); 958 error = IS_ERR(p);
958 if (error) { 959 if (error) {
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1160 GFS2_BASIC_BLOCK_SHIFT; 1161 GFS2_BASIC_BLOCK_SHIFT;
1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1162 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1162 1163
1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1164 sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; 1165 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) { 1166 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0; 1167 sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 memset(&args, 0, sizeof(args)); 1324 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT; 1325 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT; 1326 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60; 1327 args.ar_commit = 30;
1327 args.ar_statfs_quantum = 30; 1328 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60; 1329 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT; 1330 args.ar_errors = GFS2_ERRORS_DEFAULT;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..49667d68769e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
637 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
638 struct buffer_head *bh, *dibh; 638 struct buffer_head *bh, *dibh;
639 struct page *page; 639 struct page *page;
640 void *kaddr; 640 void *kaddr, *ptr;
641 struct gfs2_quota *qp; 641 struct gfs2_quota q, *qp;
642 s64 value; 642 int err, nbytes;
643 int err = -EIO;
644 u64 size; 643 u64 size;
645 644
646 if (gfs2_is_stuffed(ip)) 645 if (gfs2_is_stuffed(ip))
647 gfs2_unstuff_dinode(ip, NULL); 646 gfs2_unstuff_dinode(ip, NULL);
648 647
648 memset(&q, 0, sizeof(struct gfs2_quota));
649 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
650 if (err < 0)
651 return err;
652
653 err = -EIO;
654 qp = &q;
655 qp->qu_value = be64_to_cpu(qp->qu_value);
656 qp->qu_value += change;
657 qp->qu_value = cpu_to_be64(qp->qu_value);
658 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
662 qd->qd_qb.qb_warn = qp->qu_warn;
663 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
666 qd->qd_qb.qb_limit = qp->qu_limit;
667 }
668 }
669
670 /* Write the quota into the quota file on disk */
671 ptr = qp;
672 nbytes = sizeof(struct gfs2_quota);
673get_a_page:
649 page = grab_cache_page(mapping, index); 674 page = grab_cache_page(mapping, index);
650 if (!page) 675 if (!page)
651 return -ENOMEM; 676 return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
667 if (!buffer_mapped(bh)) { 692 if (!buffer_mapped(bh)) {
668 gfs2_block_map(inode, iblock, bh, 1); 693 gfs2_block_map(inode, iblock, bh, 1);
669 if (!buffer_mapped(bh)) 694 if (!buffer_mapped(bh))
670 goto unlock; 695 goto unlock_out;
696 /* If it's a newly allocated disk block for quota, zero it */
697 if (buffer_new(bh)) {
698 memset(bh->b_data, 0, bh->b_size);
699 set_buffer_uptodate(bh);
700 }
671 } 701 }
672 702
673 if (PageUptodate(page)) 703 if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If the quota straddles a page boundary, we need to update the
725 * rest of it at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
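The get_a_page loop above writes a record that may straddle a page boundary in at most two passes. A self-contained userspace sketch of the same copy logic (the page array and all names are invented; grab_cache_page()/kmap_atomic() are replaced by a plain lookup):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096u

static void write_straddling(uint8_t *pages[], unsigned long index,
			     unsigned offset, const void *rec, size_t reclen)
{
	const uint8_t *ptr = rec;
	size_t nbytes = reclen;

	for (;;) {
		uint8_t *kaddr = pages[index];       /* ~kmap_atomic() */
		size_t chunk = nbytes;

		if (offset + chunk > PAGE_SIZE)
			chunk = PAGE_SIZE - offset;  /* clip to page end */
		memcpy(kaddr + offset, ptr, chunk);
		if (chunk == nbytes)
			break;                       /* record complete */
		ptr += chunk;                        /* remainder starts */
		nbytes -= chunk;                     /* on the next page */
		offset = 0;
		index++;
	}
}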
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 at the end: one block for unstuffing, one for the inode
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * size update, and one more in case the quota straddles a page
818 * boundary and two blocks must be updated instead of one */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1432 return 0; 1476 return 0;
1433} 1477}
1434 1478
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, 1479static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq) 1480 struct fs_disk_quota *fdq)
1437{ 1481{
1438 struct gfs2_sbd *sdp = sb->s_fs_info; 1482 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb; 1483 struct gfs2_quota_lvb *qlvb;
@@ -1477,8 +1521,8 @@ out:
1477/* GFS2 only supports a subset of the XFS fields */ 1521/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479 1523
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, 1524static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq) 1525 struct fs_disk_quota *fdq)
1482{ 1526{
1483 struct gfs2_sbd *sdp = sb->s_fs_info; 1527 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1585,7 +1629,7 @@ out_put:
1585const struct quotactl_ops gfs2_quotactl_ops = { 1629const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync, 1630 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate, 1631 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get, 1632 .get_dqblk = gfs2_get_dqblk,
1589 .set_xquota = gfs2_xquota_set, 1633 .set_dqblk = gfs2_set_dqblk,
1590}; 1634};
1591 1635
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
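Beyond the new flag arguments, the surrounding loop batches contiguous free sectors into a single discard per run, issuing the pending run whenever the next sector breaks contiguity. A standalone model of that batching (sector numbers are made up; issue_discard() stands in for blkdev_issue_discard()):

#include <stdio.h>

typedef unsigned long long u64;

static void issue_discard(u64 start, u64 nr_sects)
{
        /* stand-in for blkdev_issue_discard(bdev, start, nr_sects,
         * GFP_NOFS, BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER) */
        printf("discard %llu..%llu\n", start, start + nr_sects - 1);
}

int main(void)
{
        u64 blks[] = { 10, 11, 12, 40, 41, 99 };   /* free sectors */
        u64 start = 0, nr_sects = 0;
        size_t i;

        for (i = 0; i < sizeof(blks) / sizeof(blks[0]); i++) {
                u64 blk = blks[i];

                if (nr_sects && start + nr_sects != blk) {
                        issue_discard(start, nr_sects);   /* run broken */
                        nr_sects = 0;
                }
                if (!nr_sects)
                        start = blk;
                nr_sects++;
        }
        if (nr_sects)
                issue_discard(start, nr_sects);           /* final run */
        return 0;
}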
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 949 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 950 * @rgd: The rgrp
950 * 951 *
951 * Returns: The inode, if one has been found 952 * Returns: the block address of an unlinked
953 * inode, if one was found; otherwise 0.
952 */ 954 */
953 955
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 957 u64 skip)
956{ 958{
957 struct inode *inode;
958 u32 goal = 0, block; 959 u32 goal = 0, block;
959 u64 no_addr; 960 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 961 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 980 if (no_addr == skip)
980 continue; 981 continue;
981 *last_unlinked = no_addr; 982 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 983 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 984 }
987 985
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 986 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 987 return 0;
990} 988}
991 989
992/** 990/**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1065 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1066 *
1069 * Returns: errno 1067 * Returns: errno
1068 * unlinked: the block address of an unlinked block to be reclaimed
1070 */ 1069 */
1071 1070
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1071static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1072 u64 *last_unlinked)
1073{ 1073{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1074 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1075 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1076 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1079 int loops = 0;
1081 int error, rg_locked; 1080 int error, rg_locked;
1082 1081
1082 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1084
1085 while (rgd) { 1085 while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1096 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1097 if (try_rgrp_fit(rgd, al))
1098 goto out; 1098 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1099 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1100 way we can recover from a failed try_rgrp_unlink
1101 because that would require an iput which can only
1102 happen after the rgrp is unlocked. */
1103 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1104 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1105 ip->i_no_addr);
1101 if (!rg_locked) 1106 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1107 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1108 if (*unlinked)
1104 return inode; 1109 return -EAGAIN;
1105 /* fall through */ 1110 /* fall through */
1106 case GLR_TRYFAILED: 1111 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1112 rgd = recent_rgrp_next(rgd);
1108 break; 1113 break;
1109 1114
1110 default: 1115 default:
1111 return ERR_PTR(error); 1116 return error;
1112 } 1117 }
1113 } 1118 }
1114 1119
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1135 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1136 if (try_rgrp_fit(rgd, al))
1132 goto out; 1137 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1138 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1139 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1140 ip->i_no_addr);
1135 if (!rg_locked) 1141 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1142 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1143 if (*unlinked)
1138 return inode; 1144 return -EAGAIN;
1139 break; 1145 break;
1140 1146
1141 case GLR_TRYFAILED: 1147 case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1149 break;
1144 1150
1145 default: 1151 default:
1146 return ERR_PTR(error); 1152 return error;
1147 } 1153 }
1148 1154
1149 rgd = gfs2_rgrpd_get_next(rgd); 1155 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1158
1153 if (rgd == begin) { 1159 if (rgd == begin) {
1154 if (++loops >= 3) 1160 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1161 return -ENOSPC;
1156 if (!skipped) 1162 if (!skipped)
1157 loops++; 1163 loops++;
1158 flags = 0; 1164 flags = 0;
@@ -1172,7 +1178,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1178 forward_rgrp_set(sdp, rgd);
1173 } 1179 }
1174 1180
1175 return NULL; 1181 return 0;
1176} 1182}
1177 1183
1178/** 1184/**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1186{ 1192{
1187 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1188 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1208 if (inode) { 1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contain its block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1220 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1221 return error;
1213 iput(inode); 1222
1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1224 /* regardless of whether or not gfs2_process_unlinked_inode
1225 was successful, we don't want to repeat it. */
1226 last_unlinked = unlinked;
1214 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1228 error = 0;
1229
1215 goto try_again; 1230 goto try_again;
1216 } 1231 }
1217 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1218 al->al_file = file; 1233 al->al_file = file;
1219 al->al_line = line; 1234 al->al_line = line;
1220 1235
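The caller's control flow is worth seeing end to end: get_local_rgrp() no longer returns an inode; when it encounters an unlinked dinode it reports -EAGAIN plus the block address, and gfs2_inplace_reserve_i() reclaims the inode only after the rgrp glock has been dropped, then retries. A standalone model of that retry contract (the stub pretends exactly one unlinked dinode is found; everything here is a userspace stand-in for the kernel functions):

#include <stdio.h>
#include <errno.h>

typedef unsigned long long u64;

static int get_local_rgrp_model(u64 *unlinked, u64 *last_unlinked)
{
        static int found;

        if (!found++) {           /* pretend we hit one unlinked dinode */
                *unlinked = *last_unlinked = 4242;
                return -EAGAIN;
        }
        return 0;                 /* then the allocation succeeds */
}

int main(void)
{
        u64 unlinked = 0, last_unlinked = 0;
        int error;

try_again:
        error = get_local_rgrp_model(&unlinked, &last_unlinked);
        if (error == -EAGAIN) {
                /* reclaim happens here, outside the rgrp glock */
                printf("reclaiming unlinked dinode at block %llu\n",
                       unlinked);
                goto try_again;
        }
        printf("reservation done, error=%d\n", error);
        return 0;
}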
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 419042f7f0b6..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -233,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
233 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 233 if (glops == NULL)
235 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
236 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
237 if (rv) 238 if (rv)
238 return rv; 239 return rv;
@@ -469,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
469} \ 470} \
470TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
471 472
472TUNE_ATTR(incore_log_blocks, 0);
473TUNE_ATTR(log_flush_secs, 0);
474TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
475TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
476TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -482,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
483 482
484static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
485 &tune_attr_incore_log_blocks.attr,
486 &tune_attr_log_flush_secs.attr,
487 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
488 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
489 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
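gfs2_log_release(), added above as a static helper, returns reserved blocks to the journal's free pool, with a withdraw assertion that the pool can never exceed the journal size. A toy model of that accounting invariant (plain ints stand in for atomic_t; the sizes are made up):

#include <assert.h>
#include <stdio.h>

static int log_blks_free = 5;
static const int jd_blocks = 8;   /* illustrative journal size */

static void log_release_model(int blks)
{
        log_blks_free += blks;
        assert(log_blks_free <= jd_blocks);  /* gfs2_assert_withdraw */
        /* the real code also drops sd_log_flush_lock here */
}

int main(void)
{
        log_release_model(3);
        printf("free log blocks: %d\n", log_blks_free);  /* 8 */
        return 0;
}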
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
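The conversion from .ioctl to .unlocked_ioctl moves responsibility for the big kernel lock into the handler itself, which is why every early return above grows an unlock_kernel() first. The shape of that pattern, modeled in userspace with a pthread mutex standing in for the BKL:

#include <stdio.h>
#include <pthread.h>
#include <errno.h>

static pthread_mutex_t bkl = PTHREAD_MUTEX_INITIALIZER;

static long ioctl_model(unsigned int cmd)
{
        long err;

        pthread_mutex_lock(&bkl);     /* was implicit with .ioctl */
        switch (cmd) {
        case 1:
                err = 0;              /* normal path */
                break;
        default:
                err = -ENOTTY;        /* unknown command */
                break;
        }
        pthread_mutex_unlock(&bkl);   /* every return path unlocks */
        return err;
}

int main(void)
{
        printf("cmd 1 -> %ld, cmd 9 -> %ld\n",
               ioctl_model(1), ioctl_model(9));
        return 0;
}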
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
@@ -410,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
410 return 0; 411 return 0;
411} 412}
412 413
413int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
414{ 415{
415 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
416} 417}
417 418
418static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/slab.h>
18 19
19/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 20/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
20 21
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/inode.c b/fs/inode.c
index 407bf392e20a..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
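The rewritten __iget() collapses a racy read-then-increment into a single atomic increment whose return value says whether this was the first reference. The same idiom in C11 atomics, as a runnable model:

#include <stdio.h>
#include <stdatomic.h>

static atomic_int i_count = 0;

static void iget_model(void)
{
        /* fetch_add returns the old value, so +1 matches the kernel's
         * atomic_inc_return() */
        if (atomic_fetch_add(&i_count, 1) + 1 != 1)
                return;         /* already referenced: nothing to do */

        /* first reference: move inode off the unused list, etc. */
        printf("first reference taken\n");
}

int main(void)
{
        iget_model();   /* prints: first reference taken */
        iget_model();   /* no output: count was already nonzero */
        printf("final count: %d\n", atomic_load(&i_count));
        return 0;
}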
@@ -1205,8 +1203,6 @@ void generic_delete_inode(struct inode *inode)
1205 inodes_stat.nr_inodes--; 1203 inodes_stat.nr_inodes--;
1206 spin_unlock(&inode_lock); 1204 spin_unlock(&inode_lock);
1207 1205
1208 security_inode_delete(inode);
1209
1210 if (op->delete_inode) { 1206 if (op->delete_inode) {
1211 void (*delete)(struct inode *) = op->delete_inode; 1207 void (*delete)(struct inode *) = op->delete_inode;
1212 /* Filesystems implementing their own 1208 /* Filesystems implementing their own
@@ -1610,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1610 inode->i_ino); 1606 inode->i_ino);
1611} 1607}
1612EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * Init uid, gid and mode for a new inode according to POSIX standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
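inode_init_owner() centralizes the POSIX setgid-directory rule that filesystems previously open-coded: a new inode created in a setgid directory inherits the directory's group, and a new subdirectory also keeps the setgid bit. A userspace model of just that rule (uid/gid values are illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

static void init_owner_model(struct toy_inode *inode,
                             const struct toy_inode *dir, mode_t mode,
                             unsigned fsuid, unsigned fsgid)
{
        inode->uid = fsuid;
        if (dir && (dir->mode & S_ISGID)) {
                inode->gid = dir->gid;
                if (S_ISDIR(mode))
                        mode |= S_ISGID;   /* subdirs stay setgid */
        } else {
                inode->gid = fsgid;
        }
        inode->mode = mode;
}

int main(void)
{
        struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
        struct toy_inode child;

        init_owner_model(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("gid=%u setgid=%d\n", child.gid,
               !!(child.mode & S_ISGID));   /* gid=100 setgid=1 */
        return 0;
}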
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
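Turning the macros into inline functions fixes a real defect: the old logical_to_blk() macro ended in a stray semicolon, and neither macro type-checked its arguments. A runnable demonstration of the shift arithmetic for a 4 KiB block size (i_blkbits == 12; the typedefs are local stand-ins for the kernel types):

#include <stdio.h>

typedef unsigned long long sector_t;
typedef long long loff_t;

struct toy_inode { unsigned int i_blkbits; };

static inline sector_t logical_to_blk(struct toy_inode *inode, loff_t offset)
{
        return offset >> inode->i_blkbits;
}

static inline loff_t blk_to_logical(struct toy_inode *inode, sector_t blk)
{
        return blk << inode->i_blkbits;
}

int main(void)
{
        struct toy_inode inode = { .i_blkbits = 12 };  /* 4096-byte blocks */

        printf("offset 8192 -> block %llu\n",
               logical_to_blk(&inode, 8192));          /* 2 */
        printf("block 3 -> offset %lld\n",
               blk_to_logical(&inode, 3));             /* 12288 */
        return 0;
}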
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
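The rewritten loop tracks position purely in block numbers (start_blk against last_blk) and uses the first hole past EOF to tag the final extent with FIEMAP_EXTENT_LAST. A toy walk over an array of mapped and hole blocks showing that merge-and-tag behavior (the map[] array stands in for get_block(); all sizes are made up):

#include <stdio.h>

static const int map[] = { 1, 1, 0, 1, 0, 0 };  /* 1 = mapped block */
#define NBLKS 4          /* i_size in blocks; blocks 4,5 are past EOF */

int main(void)
{
        int start = -1, i;

        for (i = 0; i < (int)(sizeof(map) / sizeof(map[0])); i++) {
                if (map[i]) {
                        if (start < 0)
                                start = i;      /* begin a merged run */
                        continue;
                }
                if (start >= 0) {   /* a hole ends the pending extent */
                        printf("extent %d..%d%s\n", start, i - 1,
                               i >= NBLKS ? " [LAST]" : "");
                        start = -1;
                }
        }
        if (start >= 0)             /* file ended inside a mapped run */
                printf("extent %d..%d [LAST]\n", start, i - 1);
        return 0;
}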
@@ -511,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
511 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
512 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
513 527
514 /* If a blockdevice-backed filesystem isn't specified, return. */
515 if (sb->s_bdev == NULL)
516 return -EINVAL;
517
518 /* Freeze */ 528 /* Freeze */
519 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
520 if (IS_ERR(sb))
521 return PTR_ERR(sb);
522 return 0;
523} 530}
524 531
525static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -529,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
529 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
530 return -EPERM; 537 return -EPERM;
531 538
532 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
533 if (sb->s_bdev == NULL)
534 return -EINVAL;
535
536 /* Thaw */ 539 /* Thaw */
537 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
538} 541}
539 542
540/* 543/*
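From userspace nothing changes: FIFREEZE and FITHAW still land in ioctl_fsfreeze()/ioctl_fsthaw(), but by going through freeze_super()/thaw_super() they no longer require an underlying block device. A minimal caller (Linux-only; the mount point is hypothetical and CAP_SYS_ADMIN is required):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FIFREEZE, FITHAW */

int main(void)
{
        int fd = open("/mnt/test", O_RDONLY);   /* hypothetical mount */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FIFREEZE, 0))
                perror("FIFREEZE");
        else if (ioctl(fd, FITHAW, 0))
                perror("FITHAW");
        close(fd);
        return 0;
}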
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
22#include <linux/gfp.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/ioprio.h> 24#include <linux/ioprio.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/gfp.h>
14#include "isofs.h" 15#include "isofs.h"
15 16
16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 17int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
@@ -271,6 +272,7 @@ static int isofs_readdir(struct file *filp,
271 272
272const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
273{ 274{
275 .llseek = generic_file_llseek,
274 .read = generic_read_dir, 276 .read = generic_read_dir,
275 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
276}; 278};
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/gfp.h>
10#include "isofs.h" 11#include "isofs.h"
11 12
12/* 13/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd.h> 18#include <linux/jbd.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h> 20#include <linux/mm.h>
22#include <linux/pagemap.h> 21#include <linux/pagemap.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
@@ -787,6 +786,12 @@ wait_for_iobuf:
787 786
788 jbd_debug(3, "JBD: commit phase 6\n"); 787 jbd_debug(3, "JBD: commit phase 6\n");
789 788
789 /* All metadata is written, now write commit record and do cleanup */
790 spin_lock(&journal->j_state_lock);
791 J_ASSERT(commit_transaction->t_state == T_COMMIT);
792 commit_transaction->t_state = T_COMMIT_RECORD;
793 spin_unlock(&journal->j_state_lock);
794
790 if (journal_write_commit_record(journal, commit_transaction)) 795 if (journal_write_commit_record(journal, commit_transaction))
791 err = -EIO; 796 err = -EIO;
792 797
@@ -924,7 +929,7 @@ restart_loop:
924 929
925 jbd_debug(3, "JBD: commit phase 8\n"); 930 jbd_debug(3, "JBD: commit phase 8\n");
926 931
927 J_ASSERT(commit_transaction->t_state == T_COMMIT); 932 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
928 933
929 commit_transaction->t_state = T_FINISHED; 934 commit_transaction->t_state = T_FINISHED;
930 J_ASSERT(commit_transaction == journal->j_committing_transaction); 935 J_ASSERT(commit_transaction == journal->j_committing_transaction);
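The new T_COMMIT_RECORD state marks the window after all metadata is written but before the commit record is; ordering it after T_COMMIT is what lets journal_trans_will_send_data_barrier(), added below in fs/jbd/journal.c, test t_state >= T_COMMIT_RECORD. A tiny model of that ordering check (values illustrative; the real enum lives in include/linux/jbd.h and has more states):

#include <stdio.h>

enum toy_tstate { T_COMMIT, T_COMMIT_RECORD, T_FINISHED };  /* order matters */

int main(void)
{
        enum toy_tstate s = T_COMMIT_RECORD;

        /* the commit record may already be on disk, so callers can no
         * longer rely on this transaction's barrier */
        printf("past commit record: %d\n", s >= T_COMMIT_RECORD);  /* 1 */
        return 0;
}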
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent a barrier request
569 * connected with a transaction commit. If 0 is returned, the transaction
570 * may or may not have sent the barrier. Used to avoid sending the
571 * barrier twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
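The natural consumer is a filesystem fsync path: if the commit for the relevant tid has not yet written its commit record, the barrier that commit sends makes a separate device flush redundant. A userspace model of that decision (the toy journal and stand-in predicate below mimic only the documented contract; all other names are hypothetical):

#include <stdio.h>

typedef unsigned int tid_t;
struct toy_journal { tid_t commit_sequence; int barrier; };

/* Stand-in with the real function's contract: returns 1 only if the
 * commit of `tid` has not yet sent its barrier. */
static int trans_will_send_data_barrier(struct toy_journal *j, tid_t tid)
{
        return j->barrier && j->commit_sequence < tid;
}

static void fsync_model(struct toy_journal *j, tid_t tid)
{
        int needs_flush = !trans_will_send_data_barrier(j, tid);

        /* ... start and wait for the commit here ... */
        if (needs_flush)
                printf("tid %u: issuing our own cache flush\n", tid);
        else
                printf("tid %u: commit will send the barrier\n", tid);
}

int main(void)
{
        struct toy_journal j = { .commit_sequence = 5, .barrier = 1 };

        fsync_model(&j, 4);     /* already committed: flush ourselves */
        fsync_model(&j, 9);     /* commit pending: barrier comes free */
        return 0;
}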
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd.h> 21#include <linux/jbd.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif 23#endif
25 24
26/* 25/*
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..bc2ff5932769 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1890 if (unlikely(i < 0))
1891 i = 0; 1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1892 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1893 return jbd2_slab[i];
1894} 1894}
1895 1895
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..a33aab6b5e68 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 419 return rc;
420} 420}
421 421
422struct xattr_handler jffs2_acl_access_xattr_handler = { 422const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 424 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 427 .set = jffs2_acl_setxattr,
428}; 428};
429 429
430struct xattr_handler jffs2_acl_default_xattr_handler = { 430const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 432 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
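jffs2_garbage_collect_trigger() now documents a locking contract instead of enforcing one: callers must already hold erase_completion_lock, and assert_spin_locked() catches violations. A userspace model of the same caller-holds-lock convention (a flag plus assert stands in for assert_spin_locked()):

#include <assert.h>
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int lock_held;                  /* models assert_spin_locked() */

static void gc_trigger_model(void)
{
        assert(lock_held);             /* caller must hold the lock */
        printf("waking GC thread\n");
}

int main(void)
{
        pthread_mutex_lock(&lock);
        lock_held = 1;
        gc_trigger_model();            /* correct usage */
        lock_held = 0;
        pthread_mutex_unlock(&lock);
        return 0;
}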
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
@@ -27,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
27 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
28static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
29 28
30int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
31{ 30{
32 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
33 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
34 33
35 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..8bc2c80ab159 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
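Combined with the erase.c change, jffs2_garbage_collect_pass() now performs the cheap erase work itself and treats a completed erase as the pass's progress. A hedged userspace model of the resulting control flow (locking and the real node-copying elided; names illustrative):

#include <stdio.h>

static int erasable = 1;        /* blocks sitting on the erase lists */

static int erase_pending_blocks(int count)
{
        (void)count;            /* limit ignored in this toy version */
        if (!erasable)
                return 0;
        erasable--;
        return 1;               /* one block erased: progress was made */
}

static int garbage_collect_pass(void)
{
        /* Blocks that only need erasing are handled first, and a
         * successful erase counts as this pass's progress. */
        if (erase_pending_blocks(1))
                return 0;

        printf("no erasable blocks; doing real GC\n");
        return 0;               /* real node copying would happen here */
}

int main(void)
{
        garbage_collect_pass(); /* erases the one pending block */
        garbage_collect_pass(); /* falls through to real GC */
        return 0;
}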
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
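The old/new to old_id/new_id rename touches JFFS2's on-media device-number union: a device node occupies two bytes when the number fits the legacy 8:8 major/minor split, four bytes otherwise. A self-contained userspace sketch of the same scheme; the 32-bit packing mirrors the kernel's new_encode_dev(), while the on-media endianness conversion (cpu_to_je16/je32) is omitted:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for union jffs2_device_node with the renamed fields. */
union device_node {
        uint16_t old_id;        /* legacy 8-bit major : 8-bit minor */
        uint32_t new_id;        /* wider huge-dev encoding */
};

static size_t encode_dev(union device_node *jdev, unsigned major, unsigned minor)
{
        if (major < 256 && minor < 256) {
                jdev->old_id = (uint16_t)((major << 8) | minor);
                return sizeof(jdev->old_id);    /* 2 bytes on the medium */
        }
        /* Same bit layout as the kernel's new_encode_dev(). */
        jdev->new_id = (minor & 0xff) | (major << 8) | ((minor & ~0xffu) << 12);
        return sizeof(jdev->new_id);            /* 4 bytes on the medium */
}

int main(void)
{
        union device_node d;

        printf("sda1 (8,1):  %zu bytes\n", encode_dev(&d, 8, 1));
        printf("big (300,5): %zu bytes\n", encode_dev(&d, 300, 5));
        return 0;
}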
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
@@ -117,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
117 116
118 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
119 118
120 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
121 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
122 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
123 return ret; 134 return ret;
124 135
125 cond_resched(); 136 cond_resched();
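The jffs2_reserve_space() hunk above replaces "erase one block inline" with "sleep on c->erase_wait until the eraser has made progress". The add_wait_queue / set_current_state / unlock / schedule sequence registers the sleeper before dropping the lock, so a wake-up cannot be lost in between; in userspace, pthread_cond_wait() gives the same guarantee atomically, as this minimal model shows (names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;     /* ~erase_completion_lock */
static pthread_cond_t erase_wait = PTHREAD_COND_INITIALIZER; /* ~c->erase_wait */
static int nr_erasing_blocks = 1;

static void *eraser(void *arg)
{
        (void)arg;
        sleep(1);                               /* pretend the flash erase takes a while */
        pthread_mutex_lock(&lock);
        nr_erasing_blocks = 0;
        pthread_cond_broadcast(&erase_wait);    /* kernel: wake_up(&c->erase_wait) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, eraser, NULL);

        pthread_mutex_lock(&lock);
        while (nr_erasing_blocks)               /* always re-check after waking */
                pthread_cond_wait(&erase_wait, &lock);  /* unlock + sleep atomically */
        pthread_mutex_unlock(&lock);

        puts("erase finished; retry the space reservation");
        pthread_join(t, NULL);
        return 0;
}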
@@ -218,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
218 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
219 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
220 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
221 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
222 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
223 ejeb->offset)); 234 ejeb->offset));
224 } 235 }
@@ -470,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
470void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
471{ 482{
472 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
473 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
474 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
475} 488}
476 489
@@ -612,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
612 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
613 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
614 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
615 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
616 } else { 629 } else {
617 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
618 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -733,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
733 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
734 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
735 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
736 if (c->unchecked_size) { 753 if (c->unchecked_size) {
737 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
738 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
@@ -159,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
159extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
160extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
161extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
162int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
163int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
164 163
165/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
24#include "jfs_incore.h" 25#include "jfs_incore.h"
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
@@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
98 if (rc) 98 if (rc)
99 return rc; 99 return rc;
100 100
101 if (iattr->ia_valid & ATTR_SIZE) 101 if (is_quota_modification(inode, iattr))
102 dquot_initialize(inode); 102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
61 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
62 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
63 } else { 63 } else {
64 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
65 /* 65 /*
66 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
67 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1440 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1441 */ 1442 */
1442 agperlev = 1443 agperlev =
1443 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1444 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1445 1446
1446 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1459 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1460 * free space. 1461 * free space.
1461 */ 1462 */
1462 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1463 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1464 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1465 ti = m + n; 1466 ti = m + n;
@@ -2437,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2437 2438
2438 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2439 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2441 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2442 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2443 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3606 } 3607 }
3607 3608
3608 /* 3609 /*
3609 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3610 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3611 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3612 * each, which starts at agstart index node of the summary tree node 3613 * each, which starts at agstart index node of the summary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3615 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3616 l2nl = 3617 l2nl =
3617 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3618 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3619 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3620 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3621 i--) { 3622 i--) {
3622 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3623 n <<= 2; 3624 n <<= 2;
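Because a dmapctl summary tree fans out by four, each level of AG height consumes two bits of l2nl and any leftover bit becomes the AG width; the loop then skips the 4^0 + 4^1 + ... nodes that sit above that height to find the start index. A standalone check of the dbFinalizeBmap() arithmetic above (the l2nl inputs are made up):

#include <stdio.h>

int main(void)
{
        int l2nl, agheight, agwidth, agstart, n, i;

        for (l2nl = 0; l2nl <= 5; l2nl++) {
                agheight = l2nl >> 1;                    /* 2 bits per level: fan-out of 4 */
                agwidth = 1 << (l2nl - (agheight << 1)); /* leftover bit, if any */
                for (i = 5 - agheight, agstart = 0, n = 1; i > 0; i--) {
                        agstart += n;                    /* skip 4^0, 4^1, ... nodes */
                        n <<= 2;
                }
                printf("l2nl=%d -> agheight=%d agwidth=%d agstart=%d\n",
                       l2nl, agheight, agwidth, agstart);
        }
        return 0;
}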
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
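ialloc() above swaps its open-coded ownership setup for the then-new inode_init_owner() helper. The behaviour being preserved: the inode takes the caller's fsuid, a setgid parent directory donates its gid, and a new subdirectory keeps the setgid bit. A userspace model of just that rule (this is not the kernel helper itself):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

struct mini_inode { uid_t uid; gid_t gid; mode_t mode; };

static void init_owner(struct mini_inode *inode, const struct mini_inode *dir,
                       mode_t mode, uid_t fsuid, gid_t fsgid)
{
        inode->uid = fsuid;                     /* owner is always the creator */
        if (dir && (dir->mode & S_ISGID)) {
                inode->gid = dir->gid;          /* setgid dir donates its group */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;        /* subdirectories stay setgid */
        } else {
                inode->gid = fsgid;
        }
        inode->mode = mode;
}

int main(void)
{
        struct mini_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
        struct mini_inode child;

        init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("uid=%u gid=%u setgid=%d\n", (unsigned)child.uid,
               (unsigned)child.gid, !!(child.mode & S_ISGID));
        return 0;
}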
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
48extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
49extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
50extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
51extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
52#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
956 */ 956 */
957 957
958 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
959 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
960 960
961 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
962 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
978 else { 978 else {
979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
980 980
981 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
982 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
983 983
984 /* 984 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -178,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
178 179
179 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
180 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
181 lock_kernel(); 184 lock_kernel();
182 185
183 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -395,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
395 398
396 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
397 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
398 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
399 return ret; 407 return ret;
400 } 408 }
401 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
402 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
403 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
404 unlock_kernel(); 417 unlock_kernel();
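The remount hunks order the quota work around the read-only transition: quotas are suspended before the filesystem goes read-only, and resumed only once it is writable again. A kernel-style sketch of that ordering under the same assumptions as the patch; examplefs_mount_rw() and examplefs_umount_rw() are hypothetical stand-ins for the filesystem's own helpers, and the BKL handling is elided:

static int examplefs_remount(struct super_block *sb, int *flags, char *data)
{
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
                int ret = examplefs_mount_rw(sb);       /* hypothetical helper */

                sb->s_flags &= ~MS_RDONLY;      /* writable before quota I/O */
                dquot_resume(sb, -1);           /* re-enable suspended quotas */
                return ret;
        }
        if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
                int rc = dquot_suspend(sb, -1); /* flush quota state first */

                if (rc < 0)
                        return rc;
                return examplefs_umount_rw(sb); /* hypothetical helper */
        }
        return 0;
}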
@@ -445,10 +458,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
445 /* initialize the mount flag and determine the default error handler */ 458 /* initialize the mount flag and determine the default error handler */
446 flag = JFS_ERR_REMOUNT_RO; 459 flag = JFS_ERR_REMOUNT_RO;
447 460
448 if (!parse_options((char *) data, sb, &newLVSize, &flag)) { 461 if (!parse_options((char *) data, sb, &newLVSize, &flag))
449 kfree(sbi); 462 goto out_kfree;
450 return -EINVAL;
451 }
452 sbi->flag = flag; 463 sbi->flag = flag;
453 464
454#ifdef CONFIG_JFS_POSIX_ACL 465#ifdef CONFIG_JFS_POSIX_ACL
@@ -457,7 +468,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
457 468
458 if (newLVSize) { 469 if (newLVSize) {
459 printk(KERN_ERR "resize option for remount only\n"); 470 printk(KERN_ERR "resize option for remount only\n");
460 return -EINVAL; 471 goto out_kfree;
461 } 472 }
462 473
463 /* 474 /*
@@ -470,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
470 */ 481 */
471 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
472 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
473 488
474 /* 489 /*
475 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
@@ -477,7 +492,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
477 inode = new_inode(sb); 492 inode = new_inode(sb);
478 if (inode == NULL) { 493 if (inode == NULL) {
479 ret = -ENOMEM; 494 ret = -ENOMEM;
480 goto out_kfree; 495 goto out_unload;
481 } 496 }
482 inode->i_ino = 0; 497 inode->i_ino = 0;
483 inode->i_nlink = 1; 498 inode->i_nlink = 1;
@@ -549,9 +564,10 @@ out_mount_failed:
549 make_bad_inode(sbi->direct_inode); 564 make_bad_inode(sbi->direct_inode);
550 iput(sbi->direct_inode); 565 iput(sbi->direct_inode);
551 sbi->direct_inode = NULL; 566 sbi->direct_inode = NULL;
552out_kfree: 567out_unload:
553 if (sbi->nls_tab) 568 if (sbi->nls_tab)
554 unload_nls(sbi->nls_tab); 569 unload_nls(sbi->nls_tab);
570out_kfree:
555 kfree(sbi); 571 kfree(sbi);
556 return ret; 572 return ret;
557} 573}
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
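jfs now keeps two symlink operation tables: inline ("fast") symlinks are read straight out of the inode, while long targets go through the page cache via page_follow_link_light(). A userspace model of dispatching on target length; IDATASIZE here is an arbitrary stand-in for the on-disk inline area:

#include <stdio.h>
#include <string.h>

#define IDATASIZE 16    /* stand-in for the on-disk inline area */

struct inode;
struct inode_ops { const char *(*follow_link)(struct inode *); };

struct inode {
        const struct inode_ops *i_op;
        char inline_data[IDATASIZE];    /* "fast" symlink target */
        const char *page_data;          /* stands in for page-cache data */
};

static const char *fast_follow(struct inode *ip) { return ip->inline_data; }
static const char *page_follow(struct inode *ip) { return ip->page_data; }

static const struct inode_ops fast_symlink_ops = { fast_follow };
static const struct inode_ops page_symlink_ops = { page_follow };

static void make_symlink(struct inode *ip, const char *target)
{
        if (strlen(target) + 1 <= IDATASIZE) {
                strcpy(ip->inline_data, target);
                ip->i_op = &fast_symlink_ops;   /* jfs_fast_symlink_inode_operations */
        } else {
                ip->page_data = target;         /* really read via the page cache */
                ip->i_op = &page_symlink_ops;   /* jfs_symlink_inode_operations */
        }
}

int main(void)
{
        struct inode a = {0}, b = {0};

        make_symlink(&a, "short");
        make_symlink(&b, "a/rather/long/target/path/elsewhere");
        printf("%s\n%s\n", a.i_op->follow_link(&a), b.i_op->follow_link(&b));
        return 0;
}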
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..09e1016eb774 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,8 +5,10 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
10#include <linux/mutex.h> 12#include <linux/mutex.h>
11#include <linux/exportfs.h> 13#include <linux/exportfs.h>
12#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -57,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
57 return NULL; 59 return NULL;
58} 60}
59 61
60int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
61{
62 return 0;
63}
64
65int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
66{ 63{
67 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -189,7 +186,7 @@ const struct file_operations simple_dir_operations = {
189 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
190 .read = generic_read_dir, 187 .read = generic_read_dir,
191 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
192 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
193}; 190};
194 191
195const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -329,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
329 return 0; 326 return 0;
330} 327}
331 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed, this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks, for example, will need to free them in the case of truncate; in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (eg. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
332int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
333{ 405{
334 clear_highpage(page); 406 clear_highpage(page);
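As a usage note, a filesystem whose data lives entirely in the page cache can build its whole ->setattr from the helpers added above. A sketch under the 2.6.35-era API shown in this patch (examplefs is hypothetical; this is essentially what simple_setattr() itself does):

#include <linux/fs.h>

static int examplefs_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, iattr);  /* permission/validity checks */
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE) {
                /* i_size update + pagecache truncation; nothing on disk
                 * to free for a purely page-cache-backed filesystem. */
                error = simple_setsize(inode, iattr->ia_size);
                if (error)
                        return error;
        }

        generic_setattr(inode, iattr);          /* copy uid/gid/times/mode */
        return 0;
}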
@@ -546,6 +618,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
546} 618}
547 619
548/** 620/**
621 * simple_write_to_buffer - copy data from user space to the buffer
622 * @to: the buffer to write to
623 * @available: the size of the buffer
624 * @ppos: the current position in the buffer
625 * @from: the user space buffer to read from
626 * @count: the maximum number of bytes to read
627 *
628 * The simple_write_to_buffer() function reads up to @count bytes from the user
629 * space address starting at @from into the buffer @to at offset @ppos.
630 *
631 * On success, the number of bytes written is returned and the offset @ppos is
632 * advanced by this number, or a negative value is returned on error.
633 **/
634ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
635 const void __user *from, size_t count)
636{
637 loff_t pos = *ppos;
638 size_t res;
639
640 if (pos < 0)
641 return -EINVAL;
642 if (pos >= available || !count)
643 return 0;
644 if (count > available - pos)
645 count = available - pos;
646 res = copy_from_user(to + pos, from, count);
647 if (res == count)
648 return -EFAULT;
649 count -= res;
650 *ppos = pos + count;
651 return count;
652}
653
654/**
549 * memory_read_from_buffer - copy data from the buffer 655 * memory_read_from_buffer - copy data from the buffer
550 * @to: the kernel space buffer to read to 656 * @to: the kernel space buffer to read to
551 * @count: the maximum number of bytes to read 657 * @count: the maximum number of bytes to read
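A typical consumer of the new helper is a debugfs-style write handler backed by a fixed buffer. A kernel-style sketch assuming that setting; examplefs_write() and state_buf are illustrative names, not part of this patch:

static char state_buf[64];      /* fixed backing store, illustrative */

static ssize_t examplefs_write(struct file *file, const char __user *ubuf,
                               size_t count, loff_t *ppos)
{
        /* Copies at most sizeof(state_buf) - *ppos bytes, advances *ppos,
         * and returns the byte count, or -EFAULT if the user copy faults. */
        return simple_write_to_buffer(state_buf, sizeof(state_buf), ppos,
                                      ubuf, count);
}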
@@ -816,13 +922,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
816} 922}
817EXPORT_SYMBOL_GPL(generic_fh_to_parent); 923EXPORT_SYMBOL_GPL(generic_fh_to_parent);
818 924
819int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 925/**
926 * generic_file_fsync - generic fsync implementation for simple filesystems
927 * @file: file to synchronize
928 * @datasync: only synchronize essential metadata if true
929 *
930 * This is a generic implementation of the fsync method for simple
931 * filesystems which track all non-inode metadata in the buffers list
932 * hanging off the address_space structure.
933 */
934int generic_file_fsync(struct file *file, int datasync)
820{ 935{
821 struct writeback_control wbc = { 936 struct writeback_control wbc = {
822 .sync_mode = WB_SYNC_ALL, 937 .sync_mode = WB_SYNC_ALL,
823 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 938 .nr_to_write = 0, /* metadata-only; caller takes care of data */
824 }; 939 };
825 struct inode *inode = dentry->d_inode; 940 struct inode *inode = file->f_mapping->host;
826 int err; 941 int err;
827 int ret; 942 int ret;
828 943
@@ -837,7 +952,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
837 ret = err; 952 ret = err;
838 return ret; 953 return ret;
839} 954}
840EXPORT_SYMBOL(simple_fsync); 955EXPORT_SYMBOL(generic_file_fsync);
956
957/*
958 * No-op implementation of ->fsync for in-memory filesystems.
959 */
960int noop_fsync(struct file *file, int datasync)
961{
962 return 0;
963}
841 964
842EXPORT_SYMBOL(dcache_dir_close); 965EXPORT_SYMBOL(dcache_dir_close);
843EXPORT_SYMBOL(dcache_dir_lseek); 966EXPORT_SYMBOL(dcache_dir_lseek);
@@ -860,9 +983,10 @@ EXPORT_SYMBOL(simple_release_fs);
860EXPORT_SYMBOL(simple_rename); 983EXPORT_SYMBOL(simple_rename);
861EXPORT_SYMBOL(simple_rmdir); 984EXPORT_SYMBOL(simple_rmdir);
862EXPORT_SYMBOL(simple_statfs); 985EXPORT_SYMBOL(simple_statfs);
863EXPORT_SYMBOL(simple_sync_file); 986EXPORT_SYMBOL(noop_fsync);
864EXPORT_SYMBOL(simple_unlink); 987EXPORT_SYMBOL(simple_unlink);
865EXPORT_SYMBOL(simple_read_from_buffer); 988EXPORT_SYMBOL(simple_read_from_buffer);
989EXPORT_SYMBOL(simple_write_to_buffer);
866EXPORT_SYMBOL(memory_read_from_buffer); 990EXPORT_SYMBOL(memory_read_from_buffer);
867EXPORT_SYMBOL(simple_transaction_set); 991EXPORT_SYMBOL(simple_transaction_set);
868EXPORT_SYMBOL(simple_transaction_get); 992EXPORT_SYMBOL(simple_transaction_get);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
9#include <linux/bio.h> 9#include <linux/bio.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
80 prefetchw(&bvec->bv_page->flags); 81 prefetchw(&bvec->bv_page->flags);
81 82
82 end_page_writeback(page); 83 end_page_writeback(page);
84 page_cache_release(page);
83 } while (bvec >= bio->bi_io_vec); 85 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio); 86 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes)) 87 if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i; 100 int i;
99 101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
100 bio = bio_alloc(GFP_NOFS, max_pages); 104 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */ 105 BUG_ON(!bio);
102 106
103 for (i = 0; i < nr_pages; i++) { 107 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) { 108 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
191 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
192 int i; 196 int i;
193 197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
194 bio = bio_alloc(GFP_NOFS, max_pages); 200 bio = bio_alloc(GFP_NOFS, max_pages);
195 BUG_ON(!bio); /* FIXME: handle this */ 201 BUG_ON(!bio);
196 202
197 for (i = 0; i < nr_pages; i++) { 203 for (i = 0; i < nr_pages; i++) {
198 if (i >= max_pages) { 204 if (i >= max_pages) {
@@ -297,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
297 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
298} 304}
299 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
307{
308 return 0;
309}
310
300static const struct logfs_device_ops bd_devops = { 311static const struct logfs_device_ops bd_devops = {
301 .find_first_sb = bdev_find_first_sb, 312 .find_first_sb = bdev_find_first_sb,
302 .find_last_sb = bdev_find_last_sb, 313 .find_last_sb = bdev_find_last_sb,
@@ -304,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
304 .readpage = bdev_readpage, 315 .readpage = bdev_readpage,
305 .writeseg = bdev_writeseg, 316 .writeseg = bdev_writeseg,
306 .erase = bdev_erase, 317 .erase = bdev_erase,
318 .can_write_buf = bdev_can_write_buf,
307 .sync = bdev_sync, 319 .sync = bdev_sync,
308 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
309}; 321};
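The max_pages clamp added to both __bdev_writeseg() and do_erase() guards the allocation that follows: bio_alloc() cannot return a bio with more than BIO_MAX_PAGES vectors and yields NULL for larger requests, which the BUG_ON would turn into a crash on queues with large hardware limits. The pattern in isolation, as a sketch with an illustrative function name:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Size a bio by the queue's hardware limit, but never request more
 * vectors than bio_alloc() supports. A sketch, not part of the patch. */
static struct bio *example_alloc_seg_bio(struct request_queue *q)
{
        unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);

        if (max_pages > BIO_MAX_PAGES)
                max_pages = BIO_MAX_PAGES;
        return bio_alloc(GFP_NOFS, max_pages);
}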
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/mount.h> 10#include <linux/mount.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/slab.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)
126 127
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page)); 129 page_address(page));
129 if (err == -EUCLEAN) { 130 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */
130 err = 0; 132 err = 0;
131 /* FIXME: force GC this segment */ 133 /* FIXME: force GC this segment */
132 } 134 }
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
233 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(logfs_super(sb)->s_mtd);
234} 236}
235 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{
240 struct logfs_super *super = logfs_super(sb);
241 void *buf;
242 int err;
243
244 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf)
246 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err)
249 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize))
251 err = -EIO;
252 kfree(buf);
253out:
254 return err;
255}
256
236static const struct logfs_device_ops mtd_devops = { 257static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb, 258 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb, 259 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage, 260 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg, 261 .writeseg = mtd_writeseg,
241 .erase = mtd_erase, 262 .erase = mtd_erase,
263 .can_write_buf = mtd_can_write_buf,
242 .sync = mtd_sync, 264 .sync = mtd_sync,
243 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
244}; 266};
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
250 const struct logfs_device_ops *devops = &mtd_devops; 272 const struct logfs_device_ops *devops = &mtd_devops;
251 273
252 mtd = get_mtd_device(NULL, mtdnr); 274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd))
276 return PTR_ERR(mtd);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254} 278}
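mtd_can_write_buf() decides whether the write buffer may still be flushed to ofs: flash can only be programmed while the target range is in its erased, all-0xff state. It leans on memchr_inv(), which returns a pointer to the first byte that differs from the given pattern and NULL when the whole range matches. The predicate reduced to a sketch (assumes the memchr_inv() declaration is in scope, as it is for logfs):

#include <linux/types.h>
#include <linux/errno.h>

/* Sketch: a flash range is append-writable only while still erased. */
static int example_range_is_erased(const void *buf, size_t len)
{
        return memchr_inv(buf, 0xff, len) ? -EIO : 0;
}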
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..72d1893ddd36 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,13 +6,13 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9 9#include <linux/slab.h>
10 10
11/* 11/*
12 * Atomic dir operations 12 * Atomic dir operations
13 * 13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are 14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in seperate operations. Therefore we need to do 15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling. 16 * a small amount of journaling.
17 * 17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do 18 * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
303 (filler_t *)logfs_readpage, NULL); 303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page)) 304 if (IS_ERR(page))
305 return PTR_ERR(page); 305 return PTR_ERR(page);
306 dd = kmap_atomic(page, KM_USER0); 306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 307 BUG_ON(dd->namelen == 0);
308 308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type); 310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap_atomic(dd, KM_USER0); 311 kunmap(page);
312 page_cache_release(page); 312 page_cache_release(page);
313 if (full) 313 if (full)
314 break; 314 break;
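The switch from kmap_atomic()/KM_USER0 to plain kmap() in __logfs_readdir() matters because filldir() copies the entry name out to a user buffer and may fault or sleep, neither of which is permitted inside an atomic kmap section. The sleepable variant has no such restriction; the shape of the safe pattern, as a self-contained sketch:

#include <linux/highmem.h>

/* Illustrative only: map a highmem page, call something that may
 * sleep or fault, then unmap. Under kmap_atomic() the callback
 * would be a bug. */
static void example_with_mapped_page(struct page *page,
                                     void (*may_sleep)(void *addr))
{
        void *addr = kmap(page);        /* sleepable mapping */

        may_sleep(addr);
        kunmap(page);
}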
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{ 163{
164 move_page_to_btree(page); 164 struct logfs_block *block = logfs_block(page);
165
166 if (block->reserved_bytes) {
167 struct super_block *sb = page->mapping->host->i_sb;
168 struct logfs_super *super = logfs_super(sb);
169
170 super->s_dirty_pages -= block->reserved_bytes;
171 block->ops->free_block(sb, block);
172 BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
173 } else
174 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private); 175 BUG_ON(PagePrivate(page) || page->private);
166} 176}
167 177
@@ -209,13 +219,11 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
209 } 219 }
210} 220}
211 221
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
213{ 223{
214 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216 225
217 /* FIXME: write anchor */ 226 logfs_write_anchor(sb);
218 super->s_devops->sync(sb);
219 return 0; 227 return 0;
220} 228}
221 229
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/slab.h>
10 11
11/* 12/*
12 * Wear leveling needs to kick in when the difference between low erase 13 * Wear leveling needs to kick in when the difference between low erase
@@ -121,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
121 logfs_safe_iput(inode, cookie); 122 logfs_safe_iput(inode, cookie);
122} 123}
123 124
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) 125static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
125{ 126{
126 struct logfs_super *super = logfs_super(sb); 127 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh; 128 struct logfs_segment_header sh;
@@ -400,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
400 segno, (u64)segno << super->s_segshift, 401 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid, 402 dist, no_free_segments(sb), valid,
402 super->s_free_bytes); 403 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist); 404 cleaned = logfs_gc_segment(sb, segno);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno, 405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned); 406 valid - cleaned);
406 BUG_ON(cleaned != valid); 407 BUG_ON(cleaned != valid);
@@ -458,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
458 struct logfs_block *block; 459 struct logfs_block *block;
459 int round, progress, last_progress = 0; 460 int round, progress, last_progress = 0;
460 461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
461 if (no_free_segments(sb) >= target && 470 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES) 471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return; 472 return;
@@ -623,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
623{ 632{
624 struct logfs_super *super = logfs_super(sb); 633 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i]; 634 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh; 635 gc_level_t gc_level;
636 u32 cleaned, valid, ec;
627 u32 segno = area->a_segno; 637 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes; 638 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
629 __be32 crc;
630 int err;
631 639
632 if (!area->a_is_open) 640 if (!area->a_is_open)
633 return 0; 641 return 0;
634 642
635 for (ofs = area->a_used_bytes; 643 if (super->s_devops->can_write_buf(sb, ofs) == 0)
636 ofs <= super->s_segsize - sizeof(oh); 644 return 0;
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644 645
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); 646 printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
646 if (crc != oh.crc) { 647 /*
647 printk(KERN_INFO "interrupted header at %llx\n", 648 * The device cannot write back the write buffer. Most likely the
648 dev_ofs(sb, segno, ofs)); 649 * wbuf was already written out and the system crashed at some point
649 return 0; 650 * before the journal commit happened. In that case we wouldn't have
650 } 651 * to do anything. But if the crash happened before the wbuf was
651 } 652 * written out correctly, we must GC this segment. So assume the
652 if (ofs != area->a_used_bytes) { 653 * worst and always do the GC run.
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", 654 */
654 ofs - area->a_used_bytes, 655 area->a_is_open = 0;
655 dev_ofs(sb, segno, area->a_used_bytes)); 656 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
656 area->a_used_bytes = ofs; 657 cleaned = logfs_gc_segment(sb, segno);
657 } 658 if (cleaned != valid)
659 return -EIO;
658 return 0; 660 return 0;
659} 661}
660 662
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9#include <linux/writeback.h> 10#include <linux/writeback.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11 12
@@ -192,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
192 inode->i_ctime = CURRENT_TIME; 193 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME; 194 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1; 195 inode->i_nlink = 1;
196 li->li_refcount = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list); 197 INIT_LIST_HEAD(&li->li_freeing_list);
196 198
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) 199 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -325,7 +327,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
325 u64 ino; 327 u64 ino;
326 328
327 mutex_lock(&super->s_journal_mutex); 329 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); 330 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
329 super->s_last_ino = ino; 331 super->s_last_ino = ino;
330 super->s_inos_till_wrap--; 332 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) { 333 if (super->s_inos_till_wrap < 0) {
@@ -356,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
356 inode->i_mode = mode; 358 inode->i_mode = mode;
357 logfs_set_ino_generation(sb, inode); 359 logfs_set_ino_generation(sb, inode);
358 360
359 inode->i_uid = current_fsuid(); 361 inode_init_owner(inode, dir, mode);
360 inode->i_gid = current_fsgid();
361 if (dir->i_mode & S_ISGID) {
362 inode->i_gid = dir->i_gid;
363 if (S_ISDIR(mode))
364 inode->i_mode |= S_ISGID;
365 }
366
367 logfs_inode_setops(inode); 362 logfs_inode_setops(inode);
368 insert_inode_hash(inode); 363 insert_inode_hash(inode);
369 364
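inode_init_owner() is the then-new VFS helper that factors out exactly the owner/setgid logic deleted above; fs/minix/bitmap.c at the end of this diff makes the same substitution. Its effective behaviour, reconstructed from the removed lines as a sketch (the real helper lives in fs/inode.c):

#include <linux/fs.h>
#include <linux/cred.h>
#include <linux/stat.h>

/* Sketch of the helper's semantics: owner from the caller, group and
 * directory-setgid propagation from the parent, then the final mode. */
static void example_init_owner(struct inode *inode,
                               const struct inode *dir, int mode)
{
        inode->i_uid = current_fsuid();
        if (dir && dir->i_mode & S_ISGID) {
                inode->i_gid = dir->i_gid;
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        } else
                inode->i_gid = current_fsgid();
        inode->i_mode = mode;
}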
@@ -385,8 +380,7 @@ static void logfs_init_once(void *_li)
385 380
386static int logfs_sync_fs(struct super_block *sb, int wait) 381static int logfs_sync_fs(struct super_block *sb, int wait)
387{ 382{
388 /* FIXME: write anchor */ 383 logfs_write_anchor(sb);
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0; 384 return 0;
391} 385}
392 386
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..4b0e0616b357 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9 10
10static void logfs_calc_free(struct super_block *sb) 11static void logfs_calc_free(struct super_block *sb)
11{ 12{
@@ -131,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)
131 132
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); 133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1) 134 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize); 135 return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else 136 else
136 logfs_buf_recover(area, ofs, NULL, 0); 137 return logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138} 138}
139 139
140static void *unpack(void *from, void *to) 140static void *unpack(void *from, void *to)
@@ -244,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
244 read_erasecount(sb, unpack(jh, scratch)); 244 read_erasecount(sb, unpack(jh, scratch));
245 break; 245 break;
246 case JE_AREA: 246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch)); 247 err = read_area(sb, unpack(jh, scratch));
248 break; 248 break;
249 case JE_OBJ_ALIAS: 249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch), 250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area)
388static int journal_erase_segment(struct logfs_area *area) 388static int journal_erase_segment(struct logfs_area *area)
389{ 389{
390 struct super_block *sb = area->a_sb; 390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh; 391 union {
392 struct logfs_segment_header sh;
393 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
394 } u;
392 u64 ofs; 395 u64 ofs;
393 int err; 396 int err;
394 397
@@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area)
396 if (err) 399 if (err)
397 return err; 400 return err;
398 401
399 sh.pad = 0; 402 memset(&u, 0, sizeof(u));
400 sh.type = SEG_JOURNAL; 403 u.sh.pad = 0;
401 sh.level = 0; 404 u.sh.type = SEG_JOURNAL;
402 sh.segno = cpu_to_be32(area->a_segno); 405 u.sh.level = 0;
403 sh.ec = cpu_to_be32(area->a_erase_count); 406 u.sh.segno = cpu_to_be32(area->a_segno);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); 407 u.sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4); 408 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
409 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
406 410
407 /* This causes a bug in segment.c. Not yet. */ 411 /* This causes a bug in segment.c. Not yet. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); 412 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409 413
410 ofs = dev_ofs(sb, area->a_segno, 0); 414 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16); 415 area->a_used_bytes = sizeof(u);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh)); 416 logfs_buf_write(area, ofs, &u, sizeof(u));
413 return 0; 417 return 0;
414} 418}
415 419
@@ -493,6 +497,8 @@ static void account_shadows(struct super_block *sb)
493 497
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); 498 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); 499 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
500 btree_grim_visitor32(&tree->segment_map, 0, NULL);
501 tree->no_shadowed_segments = 0;
496 502
497 if (li->li_block) { 503 if (li->li_block) {
498 /* 504 /*
@@ -606,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
606 if (len == 0) 612 if (len == 0)
607 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
608 614
615 BUG_ON(len > sb->s_blocksize);
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) { 617 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len); 618 memcpy(data, buf, len);
613 compr_len = len; 619 compr_len = len;
614 compr = COMPR_NONE; 620 compr = COMPR_NONE;
@@ -660,6 +666,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
660 if (ofs < 0) 666 if (ofs < 0)
661 return ofs; 667 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len); 668 logfs_buf_write(area, ofs, super->s_compressed_je, len);
669 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); 670 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0; 671 return 0;
665} 672}
@@ -800,6 +807,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
800{ 807{
801 struct logfs_super *super = logfs_super(sb); 808 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area; 809 struct logfs_area *area = super->s_journal_area;
810 struct btree_head32 *head = &super->s_reserved_segments;
803 u32 segno, ec; 811 u32 segno, ec;
804 int i, err; 812 int i, err;
805 813
@@ -807,6 +815,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
807 /* Drop old segments */ 815 /* Drop old segments */
808 journal_for_each(i) 816 journal_for_each(i)
809 if (super->s_journal_seg[i]) { 817 if (super->s_journal_seg[i]) {
818 btree_remove32(head, super->s_journal_seg[i]);
810 logfs_set_segment_unreserved(sb, 819 logfs_set_segment_unreserved(sb,
811 super->s_journal_seg[i], 820 super->s_journal_seg[i],
812 super->s_journal_ec[i]); 821 super->s_journal_ec[i]);
@@ -819,8 +828,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
819 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
820 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
821 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
822 } 835 }
823 /* Manually move journal_area */ 836 /* Manually move journal_area */
837 freeseg(sb, area->a_segno);
824 area->a_segno = super->s_journal_seg[0]; 838 area->a_segno = super->s_journal_seg[0];
825 area->a_is_open = 0; 839 area->a_is_open = 0;
826 area->a_used_bytes = 0; 840 area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
144 * @erase: erase one segment 144 * @erase: erase one segment
145 * @read: read from the device 145 * @read: read from the device
146 * @erase: erase part of the device 146 * @erase: erase part of the device
147 * @can_write_buf: decide whether wbuf can be written to ofs
147 */ 148 */
148struct logfs_device_ops { 149struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); 150 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); 154 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len, 155 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write); 156 int ensure_write);
157 int (*can_write_buf)(struct super_block *sb, u64 ofs);
156 void (*sync)(struct super_block *sb); 158 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb); 159 void (*put_device)(struct super_block *sb);
158}; 160};
@@ -257,10 +259,14 @@ struct logfs_shadow {
257 * struct shadow_tree 259 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs 260 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs 261 * @old: shadows where old_ofs!=0, indexed by old_ofs
262 * @segment_map: bitfield of segments containing shadows
 263 * @no_shadowed_segments: number of segments containing shadows
260 */ 264 */
261struct shadow_tree { 265struct shadow_tree {
262 struct btree_head64 new; 266 struct btree_head64 new;
263 struct btree_head64 old; 267 struct btree_head64 old;
268 struct btree_head32 segment_map;
269 int no_shadowed_segments;
264}; 270};
265 271
266struct object_alias_item { 272struct object_alias_item {
@@ -305,13 +311,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val); 311 level_t level, int child_no, __be64 val);
306struct logfs_block_ops { 312struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block); 313 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 void (*free_block)(struct super_block *sb, struct logfs_block*block); 314 void (*free_block)(struct super_block *sb, struct logfs_block*block);
310 int (*write_alias)(struct super_block *sb, 315 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block, 316 struct logfs_block *block,
312 write_alias_t *write_one_alias); 317 write_alias_t *write_one_alias);
313}; 318};
314 319
320#define MAX_JOURNAL_ENTRIES 256
321
315struct logfs_super { 322struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */ 323 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */ 324 struct block_device *s_bdev; /* underlying device */
@@ -378,7 +385,7 @@ struct logfs_super {
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ 385 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version; 386 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */ 387 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64]; 388 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
382 int s_no_je; 389 int s_no_je;
383 390
384 int s_sum_index; /* for the 12 summaries */ 391 int s_sum_index; /* for the 12 summaries */
@@ -389,6 +396,7 @@ struct logfs_super {
389 int s_lock_count; 396 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */ 397 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ 398 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
399 struct list_head s_writeback_list; /* writeback pages */
392 /* 400 /*
393 * Space accounting: 401 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects. 402 * - s_used_bytes specifies space used to store valid data objects.
@@ -498,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg); 508 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
502 510
503/* gc.c */ 511/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
@@ -587,24 +595,25 @@ void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb); 595int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area); 596void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb); 597void logfs_sync_segments(struct super_block *sb);
598void freeseg(struct super_block *sb, u32 segno);
590 599
591/* area handling */ 600/* area handling */
592int logfs_init_areas(struct super_block *sb); 601int logfs_init_areas(struct super_block *sb);
593void logfs_cleanup_areas(struct super_block *sb); 602void logfs_cleanup_areas(struct super_block *sb);
594int logfs_open_area(struct logfs_area *area, size_t bytes); 603int logfs_open_area(struct logfs_area *area, size_t bytes);
595void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, 604int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
596 int use_filler); 605 int use_filler);
597 606
598static inline void logfs_buf_write(struct logfs_area *area, u64 ofs, 607static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
599 void *buf, size_t len) 608 void *buf, size_t len)
600{ 609{
601 __logfs_buf_write(area, ofs, buf, len, 0); 610 return __logfs_buf_write(area, ofs, buf, len, 0);
602} 611}
603 612
604static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, 613static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
605 void *buf, size_t len) 614 void *buf, size_t len)
606{ 615{
607 __logfs_buf_write(area, ofs, buf, len, 1); 616 return __logfs_buf_write(area, ofs, buf, len, 1);
608} 617}
609 618
610/* super.c */ 619/* super.c */
@@ -698,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
698 u8 level = (__force u8)__level; 707 u8 level = (__force u8)__level;
699 708
700 if (ino == LOGFS_INO_MASTER) { 709 if (ino == LOGFS_INO_MASTER) {
701 /* ifile has seperate areas */ 710 /* ifile has separate areas */
702 level += LOGFS_MAX_LEVELS; 711 level += LOGFS_MAX_LEVELS;
703 } 712 }
704 return (__force gc_level_t)level; 713 return (__force gc_level_t)level;
@@ -721,4 +730,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
721 return logfs_super(sb)->s_area[(__force u8)gc_level]; 730 return logfs_super(sb)->s_area[(__force u8)gc_level];
722} 731}
723 732
733static inline void logfs_mempool_destroy(mempool_t *pool)
734{
735 if (pool)
736 mempool_destroy(pool);
737}
738
724#endif 739#endif
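logfs_mempool_destroy() is a NULL-safe wrapper around mempool_destroy(), so teardown paths can run unconditionally even when a pool was never created; the readwrite.c and super.c hunks below drop their `if (pool)` guards in its favour. Usage shape, as a sketch:

/* Sketch: pools that failed to allocate are simply skipped. */
static void example_cleanup_pools(struct logfs_super *super)
{
        logfs_mempool_destroy(super->s_block_pool);
        logfs_mempool_destroy(super->s_shadow_pool);
}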
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
50 * 12 - gc recycled blocks, long-lived data 50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data 51 * 13 - replacement blocks, short-lived data
52 * 52 *
53 * Levels 1-11 are necessary for robust gc operations and help seperate 53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future, 54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get seperated into several segments based on simple 55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be 56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data 57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be 58 * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) 117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118 118
119/* 119/*
120 * LogFS needs to seperate data into levels. Each level is defined as the 120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file). 121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc. 122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. 123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
204 * @ds_crc: crc32 of structure starting with the next field 204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile 205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files 206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of seperate levels for data 207 * @ds_data_levels: number of separate levels for data
208 * @pad0: reserved, must be 0 208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features 209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features 210 * @ds_feature_ro_compat: read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
456 * @vim: life expectancy of data 456 * @vim: life expectancy of data
457 * 457 *
458 * "Areas" are segments currently being used for writing. There is at least 458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to seperate long-living from 459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can 460 * short-living data. If an area with unknown vim is encountered, it can
461 * simply be closed. 461 * simply be closed.
462 * The write buffer immediately follow this header. 462 * The write buffer immediately follow this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..0718d112a1a5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
18 */ 18 */
19#include "logfs.h" 19#include "logfs.h"
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21 22
22static u64 adjust_bix(u64 bix, level_t level) 23static u64 adjust_bix(u64 bix, level_t level)
23{ 24{
@@ -429,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
429 } 430 }
430} 431}
431 432
432static gc_level_t inode_block_level(struct logfs_block *block)
433{
434 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
435 return GC_LEVEL(LOGFS_MAX_LEVELS);
436}
437
438static gc_level_t indirect_block_level(struct logfs_block *block)
439{
440 struct page *page;
441 struct inode *inode;
442 u64 bix;
443 level_t level;
444
445 page = block->page;
446 inode = page->mapping->host;
447 logfs_unpack_index(page->index, &bix, &level);
448 return expand_level(inode->i_ino, level);
449}
450
451/* 433/*
452 * This silences a false, yet annoying gcc warning. I hate it when my editor 434 * This silences a false, yet annoying gcc warning. I hate it when my editor
453 * jumps into bitops.h each time I recompile this file. 435 * jumps into bitops.h each time I recompile this file.
@@ -586,14 +568,12 @@ static void indirect_free_block(struct super_block *sb,
586 568
587static struct logfs_block_ops inode_block_ops = { 569static struct logfs_block_ops inode_block_ops = {
588 .write_block = inode_write_block, 570 .write_block = inode_write_block,
589 .block_level = inode_block_level,
590 .free_block = inode_free_block, 571 .free_block = inode_free_block,
591 .write_alias = inode_write_alias, 572 .write_alias = inode_write_alias,
592}; 573};
593 574
594struct logfs_block_ops indirect_block_ops = { 575struct logfs_block_ops indirect_block_ops = {
595 .write_block = indirect_write_block, 576 .write_block = indirect_write_block,
596 .block_level = indirect_block_level,
597 .free_block = indirect_free_block, 577 .free_block = indirect_free_block,
598 .write_alias = indirect_write_alias, 578 .write_alias = indirect_write_alias,
599}; 579};
@@ -912,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
912 return bix; 892 return bix;
913 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) 893 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
914 bix = maxbix(li->li_height); 894 bix = maxbix(li->li_height);
895 else if (bix >= maxbix(li->li_height))
896 return bix;
915 else { 897 else {
916 bix = seek_holedata_loop(inode, bix, 0); 898 bix = seek_holedata_loop(inode, bix, 0);
917 if (bix < maxbix(li->li_height)) 899 if (bix < maxbix(li->li_height))
@@ -1113,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
1113int get_page_reserve(struct inode *inode, struct page *page) 1095int get_page_reserve(struct inode *inode, struct page *page)
1114{ 1096{
1115 struct logfs_super *super = logfs_super(inode->i_sb); 1097 struct logfs_super *super = logfs_super(inode->i_sb);
1098 struct logfs_block *block = logfs_block(page);
1116 int ret; 1099 int ret;
1117 1100
1118 if (logfs_block(page) && logfs_block(page)->reserved_bytes) 1101 if (block && block->reserved_bytes)
1119 return 0; 1102 return 0;
1120 1103
1121 logfs_get_wblocks(inode->i_sb, page, WF_LOCK); 1104 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1122 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); 1105 while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
1106 !list_empty(&super->s_writeback_list)) {
1107 block = list_entry(super->s_writeback_list.next,
1108 struct logfs_block, alias_list);
1109 block->ops->write_block(block);
1110 }
1123 if (!ret) { 1111 if (!ret) {
1124 alloc_data_block(inode, page); 1112 alloc_data_block(inode, page);
1125 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; 1113 block = logfs_block(page);
1114 block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1126 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; 1115 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1116 list_move_tail(&block->alias_list, &super->s_writeback_list);
1127 } 1117 }
1128 logfs_put_wblocks(inode->i_sb, page, WF_LOCK); 1118 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1129 return ret; 1119 return ret;
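The loop added to get_page_reserve() implements a reserve-or-flush policy: when logfs_reserve_bytes() fails, the oldest block on the new s_writeback_list is written back, releasing the space it had pinned, and the reservation is retried until it succeeds or the list runs dry. The control shape in isolation (names follow the hunk; a sketch, not the full function):

/* Sketch: retry a failing reservation by flushing pinned blocks. */
static int example_reserve_or_flush(struct inode *inode,
                                    struct logfs_super *super, int bytes)
{
        struct logfs_block *block;
        int ret;

        while ((ret = logfs_reserve_bytes(inode, bytes)) &&
               !list_empty(&super->s_writeback_list)) {
                block = list_entry(super->s_writeback_list.next,
                                   struct logfs_block, alias_list);
                block->ops->write_block(block); /* releases its bytes */
        }
        return ret;
}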
@@ -1240,6 +1230,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1240 mempool_free(shadow, super->s_shadow_pool); 1230 mempool_free(shadow, super->s_shadow_pool);
1241} 1231}
1242 1232
1233static void mark_segment(struct shadow_tree *tree, u32 segno)
1234{
1235 int err;
1236
1237 if (!btree_lookup32(&tree->segment_map, segno)) {
1238 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1239 GFP_NOFS);
1240 BUG_ON(err);
1241 tree->no_shadowed_segments++;
1242 }
1243}
1244
1243/** 1245/**
1244 * fill_shadow_tree - Propagate shadow tree changes due to a write 1246 * fill_shadow_tree - Propagate shadow tree changes due to a write
1245 * @inode: Inode owning the page 1247 * @inode: Inode owning the page
@@ -1287,6 +1289,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,
1287 1289
1288 super->s_dirty_used_bytes += shadow->new_len; 1290 super->s_dirty_used_bytes += shadow->new_len;
1289 super->s_dirty_free_bytes += shadow->old_len; 1291 super->s_dirty_free_bytes += shadow->old_len;
1292 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1293 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1290 } 1294 }
1291} 1295}
1292 1296
@@ -1594,7 +1598,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
1594 return ret; 1598 return ret;
1595} 1599}
1596 1600
1597/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
1598int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, 1601int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1599 gc_level_t gc_level, long flags) 1602 gc_level_t gc_level, long flags)
1600{ 1603{
@@ -1611,6 +1614,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1611 if (level != 0) 1614 if (level != 0)
1612 alloc_indirect_block(inode, page, 0); 1615 alloc_indirect_block(inode, page, 0);
1613 err = logfs_write_buf(inode, page, flags); 1616 err = logfs_write_buf(inode, page, flags);
1617 if (!err && shrink_level(gc_level) == 0) {
1618 /* Rewrite cannot mark the inode dirty but has to
 1619 * write it immediately.
1620 * Q: Can't we just create an alias for the inode
1621 * instead? And if not, why not?
1622 */
1623 if (inode->i_ino == LOGFS_INO_MASTER)
1624 logfs_write_anchor(inode->i_sb);
1625 else {
1626 err = __logfs_write_inode(inode, flags);
1627 }
1628 }
1614 } 1629 }
1615 logfs_put_write_page(page); 1630 logfs_put_write_page(page);
1616 return err; 1631 return err;
@@ -1833,19 +1848,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
1833 return logfs_truncate_direct(inode, size); 1848 return logfs_truncate_direct(inode, size);
1834} 1849}
1835 1850
1836int logfs_truncate(struct inode *inode, u64 size) 1851/*
1852 * Truncate, by changing the segment file, can consume a fair amount
1853 * of resources. So back off from time to time and do some GC.
1854 * 8 or 2048 blocks should be well within safety limits even if
1855 * every single block resided in a different segment.
1856 */
1857#define TRUNCATE_STEP (8 * 1024 * 1024)
1858int logfs_truncate(struct inode *inode, u64 target)
1837{ 1859{
1838 struct super_block *sb = inode->i_sb; 1860 struct super_block *sb = inode->i_sb;
1839 int err; 1861 u64 size = i_size_read(inode);
1862 int err = 0;
1840 1863
1841 logfs_get_wblocks(sb, NULL, 1); 1864 size = ALIGN(size, TRUNCATE_STEP);
1842 err = __logfs_truncate(inode, size); 1865 while (size > target) {
1843 if (!err) 1866 if (size > TRUNCATE_STEP)
1844 err = __logfs_write_inode(inode, 0); 1867 size -= TRUNCATE_STEP;
1845 logfs_put_wblocks(sb, NULL, 1); 1868 else
1869 size = 0;
1870 if (size < target)
1871 size = target;
1872
1873 logfs_get_wblocks(sb, NULL, 1);
1874 err = __logfs_truncate(inode, size);
1875 if (!err)
1876 err = __logfs_write_inode(inode, 0);
1877 logfs_put_wblocks(sb, NULL, 1);
1878 }
1846 1879
1847 if (!err) 1880 if (!err)
1848 err = vmtruncate(inode, size); 1881 err = vmtruncate(inode, target);
1849 1882
1850 /* I don't trust error recovery yet. */ 1883 /* I don't trust error recovery yet. */
1851 WARN_ON(err); 1884 WARN_ON(err);
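With TRUNCATE_STEP at 8 MiB, logfs_truncate() now walks down to the target in bounded chunks, dropping and retaking the wblocks between steps so GC can reclaim space in between. The loop arithmetic as a runnable userspace illustration (standalone, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define TRUNCATE_STEP   (8ULL * 1024 * 1024)
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        uint64_t size = 20ULL << 20;    /* i_size: 20 MiB */
        uint64_t target = 0;            /* truncate to zero */

        size = ALIGN_UP(size, TRUNCATE_STEP);   /* rounds up to 24 MiB */
        while (size > target) {
                size = size > TRUNCATE_STEP ? size - TRUNCATE_STEP : 0;
                if (size < target)
                        size = target;
                /* one __logfs_truncate() + inode write per iteration */
                printf("step down to %llu MiB\n",
                       (unsigned long long)(size >> 20)); /* 16, 8, 0 */
        }
        return 0;
}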
@@ -2226,6 +2259,7 @@ int logfs_init_rw(struct super_block *sb)
2226 int min_fill = 3 * super->s_no_blocks; 2259 int min_fill = 3 * super->s_no_blocks;
2227 2260
2228 INIT_LIST_HEAD(&super->s_object_alias); 2261 INIT_LIST_HEAD(&super->s_object_alias);
2262 INIT_LIST_HEAD(&super->s_writeback_list);
2229 mutex_init(&super->s_write_mutex); 2263 mutex_init(&super->s_write_mutex);
2230 super->s_block_pool = mempool_create_kmalloc_pool(min_fill, 2264 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2231 sizeof(struct logfs_block)); 2265 sizeof(struct logfs_block));
@@ -2239,8 +2273,6 @@ void logfs_cleanup_rw(struct super_block *sb)
2239 struct logfs_super *super = logfs_super(sb); 2273 struct logfs_super *super = logfs_super(sb);
2240 2274
2241 destroy_meta_inode(super->s_segfile_inode); 2275 destroy_meta_inode(super->s_segfile_inode);
2242 if (super->s_block_pool) 2276 logfs_mempool_destroy(super->s_block_pool);
2243 mempool_destroy(super->s_block_pool); 2277 logfs_mempool_destroy(super->s_shadow_pool);
2244 if (super->s_shadow_pool)
2245 mempool_destroy(super->s_shadow_pool);
2246} 2278}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..a9657afb70ad 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect. 10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */ 11 */
12#include "logfs.h" 12#include "logfs.h"
13#include <linux/slab.h>
13 14
14static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) 15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
15{ 16{
@@ -66,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
66 return page; 67 return page;
67} 68}
68 69
69void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, 70int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
70 int use_filler) 71 int use_filler)
71{ 72{
72 pgoff_t index = ofs >> PAGE_SHIFT; 73 pgoff_t index = ofs >> PAGE_SHIFT;
@@ -80,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
80 copylen = min((ulong)len, PAGE_SIZE - offset); 81 copylen = min((ulong)len, PAGE_SIZE - offset);
81 82
82 page = get_mapping_page(area->a_sb, index, use_filler); 83 page = get_mapping_page(area->a_sb, index, use_filler);
83 SetPageUptodate(page); 84 if (IS_ERR(page))
85 return PTR_ERR(page);
84 BUG_ON(!page); /* FIXME: reserve a pool */ 86 BUG_ON(!page); /* FIXME: reserve a pool */
87 SetPageUptodate(page);
85 memcpy(page_address(page) + offset, buf, copylen); 88 memcpy(page_address(page) + offset, buf, copylen);
86 SetPagePrivate(page); 89 SetPagePrivate(page);
87 page_cache_release(page); 90 page_cache_release(page);
@@ -91,52 +94,61 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
91 offset = 0; 94 offset = 0;
92 index++; 95 index++;
93 } while (len); 96 } while (len);
97 return 0;
94} 98}
95 99
96/* 100static void pad_partial_page(struct logfs_area *area)
97 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
98 */
99static void pad_wbuf(struct logfs_area *area, int final)
100{ 101{
101 struct super_block *sb = area->a_sb; 102 struct super_block *sb = area->a_sb;
102 struct logfs_super *super = logfs_super(sb);
103 struct page *page; 103 struct page *page;
104 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); 104 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
105 pgoff_t index = ofs >> PAGE_SHIFT; 105 pgoff_t index = ofs >> PAGE_SHIFT;
106 long offset = ofs & (PAGE_SIZE-1); 106 long offset = ofs & (PAGE_SIZE-1);
107 u32 len = PAGE_SIZE - offset; 107 u32 len = PAGE_SIZE - offset;
108 108
109 if (len == PAGE_SIZE) { 109 if (len % PAGE_SIZE) {
110 /* The math in this function can surely use some love */ 110 page = get_mapping_page(sb, index, 0);
111 len = 0;
112 }
113 if (len) {
114 BUG_ON(area->a_used_bytes >= super->s_segsize);
115
116 page = get_mapping_page(area->a_sb, index, 0);
117 BUG_ON(!page); /* FIXME: reserve a pool */ 111 BUG_ON(!page); /* FIXME: reserve a pool */
118 memset(page_address(page) + offset, 0xff, len); 112 memset(page_address(page) + offset, 0xff, len);
119 SetPagePrivate(page); 113 SetPagePrivate(page);
120 page_cache_release(page); 114 page_cache_release(page);
121 } 115 }
116}
122 117
123 if (!final) 118static void pad_full_pages(struct logfs_area *area)
124 return; 119{
120 struct super_block *sb = area->a_sb;
121 struct logfs_super *super = logfs_super(sb);
122 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
123 u32 len = super->s_segsize - area->a_used_bytes;
124 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
125 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
126 struct page *page;
125 127
126 area->a_used_bytes += len; 128 while (no_indizes) {
127 for ( ; area->a_used_bytes < super->s_segsize; 129 page = get_mapping_page(sb, index, 0);
128 area->a_used_bytes += PAGE_SIZE) {
129 /* Memset another page */
130 index++;
131 page = get_mapping_page(area->a_sb, index, 0);
132 BUG_ON(!page); /* FIXME: reserve a pool */ 130 BUG_ON(!page); /* FIXME: reserve a pool */
133 memset(page_address(page), 0xff, PAGE_SIZE); 131 SetPageUptodate(page);
132 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
134 SetPagePrivate(page); 133 SetPagePrivate(page);
135 page_cache_release(page); 134 page_cache_release(page);
135 index++;
136 no_indizes--;
136 } 137 }
137} 138}
138 139
139/* 140/*
141 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
142 * Also make sure we allocate (and memset) all pages for final writeout.
143 */
144static void pad_wbuf(struct logfs_area *area, int final)
145{
146 pad_partial_page(area);
147 if (final)
148 pad_full_pages(area);
149}
150
151/*
140 * We have to be careful with the alias tree. Since lookup is done by bix, 152 * We have to be careful with the alias tree. Since lookup is done by bix,
141 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with 153 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
142 * indirect blocks. So always use it through accessor functions. 154 * indirect blocks. So always use it through accessor functions.
@@ -174,14 +186,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
174 return 0; 186 return 0;
175} 187}
176 188
177static gc_level_t btree_block_level(struct logfs_block *block)
178{
179 return expand_level(block->ino, block->level);
180}
181
182static struct logfs_block_ops btree_block_ops = { 189static struct logfs_block_ops btree_block_ops = {
183 .write_block = btree_write_block, 190 .write_block = btree_write_block,
184 .block_level = btree_block_level,
185 .free_block = __free_block, 191 .free_block = __free_block,
186 .write_alias = btree_write_alias, 192 .write_alias = btree_write_alias,
187}; 193};
@@ -683,7 +689,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
683 return 0; 689 return 0;
684} 690}
685 691
686static void freeseg(struct super_block *sb, u32 segno) 692void freeseg(struct super_block *sb, u32 segno)
687{ 693{
688 struct logfs_super *super = logfs_super(sb); 694 struct logfs_super *super = logfs_super(sb);
689 struct address_space *mapping = super->s_mapping_inode->i_mapping; 695 struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -910,7 +916,7 @@ err:
910 for (i--; i >= 0; i--) 916 for (i--; i >= 0; i--)
911 free_area(super->s_area[i]); 917 free_area(super->s_area[i]);
912 free_area(super->s_journal_area); 918 free_area(super->s_journal_area);
913 mempool_destroy(super->s_alias_pool); 919 logfs_mempool_destroy(super->s_alias_pool);
914 return -ENOMEM; 920 return -ENOMEM;
915} 921}
916 922
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..d651e10a1e9c 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,8 @@
11 */ 11 */
12#include "logfs.h" 12#include "logfs.h"
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/blkdev.h>
14#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
15#include <linux/statfs.h> 17#include <linux/statfs.h>
16#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
@@ -136,6 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
136 sb->s_fs_info = super; 138 sb->s_fs_info = super;
137 sb->s_mtd = super->s_mtd; 139 sb->s_mtd = super->s_mtd;
138 sb->s_bdev = super->s_bdev; 140 sb->s_bdev = super->s_bdev;
141#ifdef CONFIG_BLOCK
142 if (sb->s_bdev)
143 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
144#endif
145#ifdef CONFIG_MTD
146 if (sb->s_mtd)
147 sb->s_bdi = sb->s_mtd->backing_dev_info;
148#endif
139 return 0; 149 return 0;
140} 150}
141 151
@@ -277,7 +287,7 @@ static int logfs_recover_sb(struct super_block *sb)
277 } 287 }
278 if (valid0 && valid1 && ds_cmp(ds0, ds1)) { 288 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
279 printk(KERN_INFO"Superblocks don't match - fixing.\n"); 289 printk(KERN_INFO"Superblocks don't match - fixing.\n");
280 return write_one_sb(sb, super->s_devops->find_last_sb); 290 return logfs_write_sb(sb);
281 } 291 }
282 /* If neither is valid now, something's wrong. Didn't we properly 292 /* If neither is valid now, something's wrong. Didn't we properly
283 * check them before?!? */ 293 * check them before?!? */
@@ -289,6 +299,10 @@ static int logfs_make_writeable(struct super_block *sb)
289{ 299{
290 int err; 300 int err;
291 301
302 err = logfs_open_segfile(sb);
303 if (err)
304 return err;
305
292 /* Repair any broken superblock copies */ 306 /* Repair any broken superblock copies */
293 err = logfs_recover_sb(sb); 307 err = logfs_recover_sb(sb);
294 if (err) 308 if (err)
@@ -299,10 +313,6 @@ static int logfs_make_writeable(struct super_block *sb)
299 if (err) 313 if (err)
300 return err; 314 return err;
301 315
302 err = logfs_open_segfile(sb);
303 if (err)
304 return err;
305
306 /* Do one GC pass before any data gets dirtied */ 316 /* Do one GC pass before any data gets dirtied */
307 logfs_gc_pass(sb); 317 logfs_gc_pass(sb);
308 318
@@ -327,27 +337,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
327 goto fail; 337 goto fail;
328 338
329 sb->s_root = d_alloc_root(rootdir); 339 sb->s_root = d_alloc_root(rootdir);
330 if (!sb->s_root) 340 if (!sb->s_root) {
341 iput(rootdir);
331 goto fail; 342 goto fail;
343 }
332 344
333 super->s_erase_page = alloc_pages(GFP_KERNEL, 0); 345 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
334 if (!super->s_erase_page) 346 if (!super->s_erase_page)
335 goto fail2; 347 goto fail;
336 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); 348 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
337 349
338 /* FIXME: check for read-only mounts */ 350 /* FIXME: check for read-only mounts */
339 err = logfs_make_writeable(sb); 351 err = logfs_make_writeable(sb);
340 if (err) 352 if (err)
341 goto fail3; 353 goto fail1;
342 354
343 log_super("LogFS: Finished mounting\n"); 355 log_super("LogFS: Finished mounting\n");
344 simple_set_mnt(mnt, sb); 356 simple_set_mnt(mnt, sb);
345 return 0; 357 return 0;
346 358
347fail3: 359fail1:
348 __free_page(super->s_erase_page); 360 __free_page(super->s_erase_page);
349fail2:
350 iput(rootdir);
351fail: 361fail:
352 iput(logfs_super(sb)->s_master_inode); 362 iput(logfs_super(sb)->s_master_inode);
353 return -EIO; 363 return -EIO;
@@ -376,7 +386,7 @@ static struct page *find_super_block(struct super_block *sb)
376 if (!first || IS_ERR(first)) 386 if (!first || IS_ERR(first))
377 return NULL; 387 return NULL;
378 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); 388 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
379 if (!last || IS_ERR(first)) { 389 if (!last || IS_ERR(last)) {
380 page_cache_release(first); 390 page_cache_release(first);
381 return NULL; 391 return NULL;
382 } 392 }
@@ -407,7 +417,7 @@ static int __logfs_read_sb(struct super_block *sb)
407 417
408 page = find_super_block(sb); 418 page = find_super_block(sb);
409 if (!page) 419 if (!page)
410 return -EIO; 420 return -EINVAL;
411 421
412 ds = page_address(page); 422 ds = page_address(page);
413 super->s_size = be64_to_cpu(ds->ds_filesystem_size); 423 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -451,6 +461,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
451 461
452 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); 462 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
453 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); 463 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
464 btree_init_mempool32(&super->s_shadow_tree.segment_map,
465 super->s_btree_pool);
454 466
455 ret = logfs_init_mapping(sb); 467 ret = logfs_init_mapping(sb);
456 if (ret) 468 if (ret)
@@ -515,8 +527,8 @@ static void logfs_kill_sb(struct super_block *sb)
515 if (super->s_erase_page) 527 if (super->s_erase_page)
516 __free_page(super->s_erase_page); 528 __free_page(super->s_erase_page);
517 super->s_devops->put_device(sb); 529 super->s_devops->put_device(sb);
518 mempool_destroy(super->s_btree_pool); 530 logfs_mempool_destroy(super->s_btree_pool);
519 mempool_destroy(super->s_alias_pool); 531 logfs_mempool_destroy(super->s_alias_pool);
520 kfree(super); 532 kfree(super);
521 log_super("LogFS: Finished unmounting\n"); 533 log_super("LogFS: Finished unmounting\n");
522} 534}
@@ -572,8 +584,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
572 return 0; 584 return 0;
573 585
574err1: 586err1:
575 up_write(&sb->s_umount); 587 deactivate_locked_super(sb);
576 deactivate_super(sb);
577 return err; 588 return err;
578err0: 589err0:
579 kfree(super); 590 kfree(super);
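
Note on the logfs_get_sb_final() error paths above: d_alloc_root() consumes the inode reference only on success, so on failure the caller still owns rootdir and must iput() it at the failure site; once sb->s_root exists, dropping the root dentry releases the inode, which is why the old fail2 label (a second iput of rootdir) could go away. A minimal sketch of the idiom, with a hypothetical getter name:

	struct inode *rootdir = fs_get_root_inode(sb);	/* hypothetical */
	if (IS_ERR(rootdir))
		goto fail;
	sb->s_root = d_alloc_root(rootdir);
	if (!sb->s_root) {
		iput(rootdir);	/* we still own the reference here */
		goto fail;
	}
	/* from here on, releasing sb->s_root drops rootdir for us,
	 * so later error labels must not iput() it a second time */
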
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
221 clear_inode(inode); /* clear in-memory copy */ 221 clear_inode(inode); /* clear in-memory copy */
222} 222}
223 223
224struct inode * minix_new_inode(const struct inode * dir, int * error) 224struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 225{
226 struct super_block *sb = dir->i_sb; 226 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 227 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 263 iput(inode);
264 return NULL; 264 return NULL;
265 } 265 }
266 inode->i_uid = current_fsuid(); 266 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 267 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 269 inode->i_blocks = 0;
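
minix_new_inode() now takes the creation mode and defers owner setup to the generic inode_init_owner() helper; the namei.c hunks below drop the matching open-coded assignments, including the setgid-directory handling in minix_mkdir(). A sketch of what the helper centralizes, mirroring the lines deleted in this patch (the real definition lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      int mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && dir->i_mode & S_ISGID) {
			inode->i_gid = dir->i_gid;	/* inherit group */
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* dirs inherit setgid */
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}
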
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..91969589131c 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,11 +72,8 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81 78
82fail: 79fail:
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
1#include <linux/buffer_head.h> 1#include <linux/buffer_head.h>
2#include <linux/slab.h>
2#include "minix.h" 3#include "minix.h"
3 4
4enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ 5enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
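
The block_to_path() rewrite above parameterizes the branching factor: INDIRCOUNT(sb) = 1 << (s_blocksize_bits - 2) is the number of 4-byte block pointers per indirect block. With 1024-byte blocks that is 1 << (10 - 2) = 256, so the old hardcoded 256, >>8 and & 255 happened to be right; with 2048-byte blocks the factor is 512 and the old arithmetic mapped blocks to the wrong slots. A stand-alone demo of the generalized mapping (hypothetical, not kernel code):

	/* Logical block -> indirection path, 7 direct slots, `ind`
	 * pointers per indirect block (blocksize / 4 on disk). */
	static int demo_block_to_path(long block, long ind, int offsets[4])
	{
		int n = 0;

		if (block < 7) {				/* direct */
			offsets[n++] = block;
		} else if ((block -= 7) < ind) {		/* single indirect */
			offsets[n++] = 7;
			offsets[n++] = block;
		} else if ((block -= ind) < ind * ind) {	/* double indirect */
			offsets[n++] = 8;
			offsets[n++] = block / ind;
			offsets[n++] = block % ind;
		} else {					/* triple indirect */
			block -= ind * ind;
			offsets[n++] = 9;
			offsets[n++] = (block / ind) / ind;
			offsets[n++] = (block / ind) % ind;
			offsets[n++] = block % ind;
		}
		return n;
	}
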
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/mpage.c b/fs/mpage.c
index 598d54e200eb..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h>
19#include <linux/bio.h> 20#include <linux/bio.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899e..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 523static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 524{
525 dput(nd->path.dentry); 525 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 526 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 527 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 528 nd->path.mnt = path->mnt;
529 }
529 nd->path.dentry = path->dentry; 530 nd->path.dentry = path->dentry;
530} 531}
531 532
@@ -1610,8 +1611,7 @@ exit:
1610 1611
1611static struct file *do_last(struct nameidata *nd, struct path *path, 1612static struct file *do_last(struct nameidata *nd, struct path *path,
1612 int open_flag, int acc_mode, 1613 int open_flag, int acc_mode,
1613 int mode, const char *pathname, 1614 int mode, const char *pathname)
1614 int *want_dir)
1615{ 1615{
1616 struct dentry *dir = nd->path.dentry; 1616 struct dentry *dir = nd->path.dentry;
1617 struct file *filp; 1617 struct file *filp;
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1621 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1622 follow_dotdot(nd); 1622 follow_dotdot(nd);
1623 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1624 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1625 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1626 error = -ESTALE; 1627 error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1628 } 1629 }
1629 } 1630 }
1630 /* fallthrough */ 1631 /* fallthrough */
1631 case LAST_DOT:
1632 case LAST_ROOT: 1632 case LAST_ROOT:
1633 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1634 goto exit; 1634 goto exit;
@@ -1642,7 +1642,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1642 if (nd->last.name[nd->last.len]) { 1642 if (nd->last.name[nd->last.len]) {
1643 if (open_flag & O_CREAT) 1643 if (open_flag & O_CREAT)
1644 goto exit; 1644 goto exit;
1645 *want_dir = 1; 1645 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1646 } 1646 }
1647 1647
1648 /* just plain open? */ 1648 /* just plain open? */
@@ -1656,8 +1656,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1656 if (path->dentry->d_inode->i_op->follow_link) 1656 if (path->dentry->d_inode->i_op->follow_link)
1657 return NULL; 1657 return NULL;
1658 error = -ENOTDIR; 1658 error = -ENOTDIR;
1659 if (*want_dir && !path->dentry->d_inode->i_op->lookup) 1659 if (nd->flags & LOOKUP_DIRECTORY) {
1660 goto exit_dput; 1660 if (!path->dentry->d_inode->i_op->lookup)
1661 goto exit_dput;
1662 }
1661 path_to_nameidata(path, nd); 1663 path_to_nameidata(path, nd);
1662 audit_inode(pathname, nd->path.dentry); 1664 audit_inode(pathname, nd->path.dentry);
1663 goto ok; 1665 goto ok;
@@ -1766,7 +1768,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
1766 int count = 0; 1768 int count = 0;
1767 int flag = open_to_namei_flags(open_flag); 1769 int flag = open_to_namei_flags(open_flag);
1768 int force_reval = 0; 1770 int force_reval = 0;
1769 int want_dir = open_flag & O_DIRECTORY;
1770 1771
1771 if (!(open_flag & O_CREAT)) 1772 if (!(open_flag & O_CREAT))
1772 mode = 0; 1773 mode = 0;
@@ -1828,14 +1829,18 @@ reval:
1828 if (open_flag & O_EXCL) 1829 if (open_flag & O_EXCL)
1829 nd.flags |= LOOKUP_EXCL; 1830 nd.flags |= LOOKUP_EXCL;
1830 } 1831 }
1831 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1832 if (open_flag & O_DIRECTORY)
1833 nd.flags |= LOOKUP_DIRECTORY;
1834 if (!(open_flag & O_NOFOLLOW))
1835 nd.flags |= LOOKUP_FOLLOW;
1836 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1832 while (unlikely(!filp)) { /* trailing symlink */ 1837 while (unlikely(!filp)) { /* trailing symlink */
1833 struct path holder; 1838 struct path holder;
1834 struct inode *inode = path.dentry->d_inode; 1839 struct inode *inode = path.dentry->d_inode;
1835 void *cookie; 1840 void *cookie;
1836 error = -ELOOP; 1841 error = -ELOOP;
1837 /* S_ISDIR part is a temporary automount kludge */ 1842 /* S_ISDIR part is a temporary automount kludge */
1838 if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) 1843 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1839 goto exit_dput; 1844 goto exit_dput;
1840 if (count++ == 32) 1845 if (count++ == 32)
1841 goto exit_dput; 1846 goto exit_dput;
@@ -1866,7 +1871,7 @@ reval:
1866 } 1871 }
1867 holder = path; 1872 holder = path;
1868 nd.flags &= ~LOOKUP_PARENT; 1873 nd.flags &= ~LOOKUP_PARENT;
1869 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1874 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1870 if (inode->i_op->put_link) 1875 if (inode->i_op->put_link)
1871 inode->i_op->put_link(holder.dentry, &nd, cookie); 1876 inode->i_op->put_link(holder.dentry, &nd, cookie);
1872 path_put(&holder); 1877 path_put(&holder);
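
The namei.c hunks above replace the want_dir out-parameter with nameidata flags computed once from the open flags, so do_last() and the trailing-symlink loop read the same state. The mapping, restated as a sketch:

	/* In do_filp_open(), before calling do_last(): */
	if (open_flag & O_DIRECTORY)
		nd.flags |= LOOKUP_DIRECTORY;	/* last component must be a dir */
	if (!(open_flag & O_NOFOLLOW))
		nd.flags |= LOOKUP_FOLLOW;	/* trailing symlink may be chased */
	/* a trailing slash on the name adds both inside do_last():
	 *	nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; */
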
@@ -2172,8 +2177,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2172 error = security_inode_rmdir(dir, dentry); 2177 error = security_inode_rmdir(dir, dentry);
2173 if (!error) { 2178 if (!error) {
2174 error = dir->i_op->rmdir(dir, dentry); 2179 error = dir->i_op->rmdir(dir, dentry);
2175 if (!error) 2180 if (!error) {
2176 dentry->d_inode->i_flags |= S_DEAD; 2181 dentry->d_inode->i_flags |= S_DEAD;
2182 dont_mount(dentry);
2183 }
2177 } 2184 }
2178 } 2185 }
2179 mutex_unlock(&dentry->d_inode->i_mutex); 2186 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2257,7 +2264,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2257 if (!error) { 2264 if (!error) {
2258 error = dir->i_op->unlink(dir, dentry); 2265 error = dir->i_op->unlink(dir, dentry);
2259 if (!error) 2266 if (!error)
2260 dentry->d_inode->i_flags |= S_DEAD; 2267 dont_mount(dentry);
2261 } 2268 }
2262 } 2269 }
2263 mutex_unlock(&dentry->d_inode->i_mutex); 2270 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2568,17 +2575,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2568 return error; 2575 return error;
2569 2576
2570 target = new_dentry->d_inode; 2577 target = new_dentry->d_inode;
2571 if (target) { 2578 if (target)
2572 mutex_lock(&target->i_mutex); 2579 mutex_lock(&target->i_mutex);
2573 dentry_unhash(new_dentry);
2574 }
2575 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2580 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2576 error = -EBUSY; 2581 error = -EBUSY;
2577 else 2582 else {
2583 if (target)
2584 dentry_unhash(new_dentry);
2578 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2585 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2586 }
2579 if (target) { 2587 if (target) {
2580 if (!error) 2588 if (!error) {
2581 target->i_flags |= S_DEAD; 2589 target->i_flags |= S_DEAD;
2590 dont_mount(new_dentry);
2591 }
2582 mutex_unlock(&target->i_mutex); 2592 mutex_unlock(&target->i_mutex);
2583 if (d_unhashed(new_dentry)) 2593 if (d_unhashed(new_dentry))
2584 d_rehash(new_dentry); 2594 d_rehash(new_dentry);
@@ -2610,7 +2620,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2610 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2620 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2611 if (!error) { 2621 if (!error) {
2612 if (target) 2622 if (target)
2613 target->i_flags |= S_DEAD; 2623 dont_mount(new_dentry);
2614 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2624 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2615 d_move(old_dentry, new_dentry); 2625 d_move(old_dentry, new_dentry);
2616 } 2626 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c70..88058de59c7c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -628,7 +628,6 @@ repeat:
628 mnt->mnt_pinned = 0; 628 mnt->mnt_pinned = 0;
629 spin_unlock(&vfsmount_lock); 629 spin_unlock(&vfsmount_lock);
630 acct_auto_close_mnt(mnt); 630 acct_auto_close_mnt(mnt);
631 security_sb_umount_close(mnt);
632 goto repeat; 631 goto repeat;
633 } 632 }
634} 633}
@@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
1117 retval = 0; 1116 retval = 0;
1118 } 1117 }
1119 spin_unlock(&vfsmount_lock); 1118 spin_unlock(&vfsmount_lock);
1120 if (retval)
1121 security_sb_umount_busy(mnt);
1122 up_write(&namespace_sem); 1119 up_write(&namespace_sem);
1123 release_mounts(&umount_list); 1120 release_mounts(&umount_list);
1124 return retval; 1121 return retval;
@@ -1432,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1432 1429
1433 err = -ENOENT; 1430 err = -ENOENT;
1434 mutex_lock(&path->dentry->d_inode->i_mutex); 1431 mutex_lock(&path->dentry->d_inode->i_mutex);
1435 if (IS_DEADDIR(path->dentry->d_inode)) 1432 if (cant_mount(path->dentry))
1436 goto out_unlock;
1437
1438 err = security_sb_check_sb(mnt, path);
1439 if (err)
1440 goto out_unlock; 1433 goto out_unlock;
1441 1434
1442 err = -ENOENT;
1443 if (!d_unlinked(path->dentry)) 1435 if (!d_unlinked(path->dentry))
1444 err = attach_recursive_mnt(mnt, path, NULL); 1436 err = attach_recursive_mnt(mnt, path, NULL);
1445out_unlock: 1437out_unlock:
1446 mutex_unlock(&path->dentry->d_inode->i_mutex); 1438 mutex_unlock(&path->dentry->d_inode->i_mutex);
1447 if (!err)
1448 security_sb_post_addmount(mnt, path);
1449 return err; 1439 return err;
1450} 1440}
1451 1441
@@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1581 } 1571 }
1582 up_write(&sb->s_umount); 1572 up_write(&sb->s_umount);
1583 if (!err) { 1573 if (!err) {
1584 security_sb_post_remount(path->mnt, flags, data);
1585
1586 spin_lock(&vfsmount_lock); 1574 spin_lock(&vfsmount_lock);
1587 touch_mnt_namespace(path->mnt->mnt_ns); 1575 touch_mnt_namespace(path->mnt->mnt_ns);
1588 spin_unlock(&vfsmount_lock); 1576 spin_unlock(&vfsmount_lock);
@@ -1623,7 +1611,7 @@ static int do_move_mount(struct path *path, char *old_name)
1623 1611
1624 err = -ENOENT; 1612 err = -ENOENT;
1625 mutex_lock(&path->dentry->d_inode->i_mutex); 1613 mutex_lock(&path->dentry->d_inode->i_mutex);
1626 if (IS_DEADDIR(path->dentry->d_inode)) 1614 if (cant_mount(path->dentry))
1627 goto out1; 1615 goto out1;
1628 1616
1629 if (d_unlinked(path->dentry)) 1617 if (d_unlinked(path->dentry))
@@ -2234,7 +2222,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2234 if (!check_mnt(root.mnt)) 2222 if (!check_mnt(root.mnt))
2235 goto out2; 2223 goto out2;
2236 error = -ENOENT; 2224 error = -ENOENT;
2237 if (IS_DEADDIR(new.dentry->d_inode)) 2225 if (cant_mount(old.dentry))
2238 goto out2; 2226 goto out2;
2239 if (d_unlinked(new.dentry)) 2227 if (d_unlinked(new.dentry))
2240 goto out2; 2228 goto out2;
@@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2277 touch_mnt_namespace(current->nsproxy->mnt_ns); 2265 touch_mnt_namespace(current->nsproxy->mnt_ns);
2278 spin_unlock(&vfsmount_lock); 2266 spin_unlock(&vfsmount_lock);
2279 chroot_fs_refs(&root, &new); 2267 chroot_fs_refs(&root, &new);
2280 security_sb_post_pivotroot(&root, &new);
2281 error = 0; 2268 error = 0;
2282 path_put(&root_parent); 2269 path_put(&root_parent);
2283 path_put(&parent_path); 2270 path_put(&parent_path);
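
Taken together, the namei.c and namespace.c hunks swap the old convention of setting S_DEAD on a victim inode for a per-dentry flag: unlink/rmdir/rename mark the victim via dont_mount(), and every place that used to test IS_DEADDIR() before mounting (graft_tree, do_move_mount, pivot_root) now asks cant_mount() on the dentry. The helpers are introduced elsewhere in this patch; a sketch of their likely shape, assuming a DCACHE_CANT_MOUNT dentry flag:

	static inline void dont_mount(struct dentry *dentry)
	{
		spin_lock(&dentry->d_lock);
		dentry->d_flags |= DCACHE_CANT_MOUNT;
		spin_unlock(&dentry->d_lock);
	}

	static inline int cant_mount(struct dentry *dentry)
	{
		return dentry->d_flags & DCACHE_CANT_MOUNT;
	}
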
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
@@ -50,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
50 49
51const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
52{ 51{
52 .llseek = generic_file_llseek,
53 .read = generic_read_dir, 53 .read = generic_read_dir,
54 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
55 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
56#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
57 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
58#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
@@ -23,7 +22,7 @@
23#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
24#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
25 24
26static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
27{ 26{
28 return 0; 27 return 0;
29} 28}
@@ -296,7 +295,7 @@ const struct file_operations ncp_file_operations =
296 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
297 .read = ncp_file_read, 296 .read = ncp_file_read,
298 .write = ncp_file_write, 297 .write = ncp_file_write,
299 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
300#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
301 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
302#endif 301#endif
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
526 sb->s_blocksize_bits = 10; 526 sb->s_blocksize_bits = 10;
527 sb->s_magic = NCP_SUPER_MAGIC; 527 sb->s_magic = NCP_SUPER_MAGIC;
528 sb->s_op = &ncp_sops; 528 sb->s_op = &ncp_sops;
529 sb->s_bdi = &server->bdi;
529 530
530 server = NCP_SBP(sb); 531 server = NCP_SBP(sb);
531 memset(server, 0, sizeof(*server)); 532 memset(server, 0, sizeof(*server));
532 533
534 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
535 if (error)
536 goto out_bdi;
537
533 server->ncp_filp = ncp_filp; 538 server->ncp_filp = ncp_filp;
534 server->ncp_sock = sock; 539 server->ncp_sock = sock;
535 540
@@ -719,6 +724,8 @@ out_fput2:
719 if (server->info_filp) 724 if (server->info_filp)
720 fput(server->info_filp); 725 fput(server->info_filp);
721out_fput: 726out_fput:
727 bdi_destroy(&server->bdi);
728out_bdi:
722 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
723 * 730 *
724 * The previously used put_filp(ncp_filp); was bogus, since 731 * The previously used put_filp(ncp_filp); was bogus, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
756 kill_pid(server->m.wdog_pid, SIGTERM, 1); 763 kill_pid(server->m.wdog_pid, SIGTERM, 1);
757 put_pid(server->m.wdog_pid); 764 put_pid(server->m.wdog_pid);
758 765
766 bdi_destroy(&server->bdi);
759 kfree(server->priv.data); 767 kfree(server->priv.data);
760 kfree(server->auth.object_name); 768 kfree(server->auth.object_name);
761 vfree(server->rxbuf); 769 vfree(server->rxbuf);
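
ncpfs gains a per-mount backing_dev_info here: bdi_setup_and_register() in ncp_fill_super(), bdi_destroy() on the failure path and in ncp_put_super(). The lifecycle pairing, sketched with a hypothetical fs-private struct standing in for ncp_server:

	struct demo_server { struct backing_dev_info bdi; };

	static int demo_fill_super(struct super_block *sb, struct demo_server *srv)
	{
		int error = bdi_setup_and_register(&srv->bdi, "demo",
						   BDI_CAP_MAP_COPY);
		if (error)
			return error;		/* nothing to destroy yet */
		sb->s_bdi = &srv->bdi;		/* writeback now targets this bdi */
		return 0;
	}

	static void demo_kill_bdi(struct demo_server *srv)
	{
		bdi_destroy(&srv->bdi);		/* pairs with setup_and_register */
	}
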
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,10 +15,12 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
22 24
23#include <linux/ncp_fs.h> 25#include <linux/ncp_fs.h>
24 26
@@ -260,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
260} 262}
261#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
262 264
263static int __ncp_ioctl(struct inode *inode, struct file *filp, 265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
264 unsigned int cmd, unsigned long arg)
265{ 266{
267 struct inode *inode = filp->f_dentry->d_inode;
266 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
267 int result; 269 int result;
268 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
@@ -840,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
840 } 842 }
841} 843}
842 844
843int ncp_ioctl(struct inode *inode, struct file *filp, 845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
844 unsigned int cmd, unsigned long arg)
845{ 846{
846 int ret; 847 long ret;
847 848
849 lock_kernel();
848 if (ncp_ioctl_need_write(cmd)) { 850 if (ncp_ioctl_need_write(cmd)) {
849 /* 851 /*
850 * inside the ioctl(), any failures which 852 * inside the ioctl(), any failures which
@@ -852,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
852 * -EACCESS, so it seems consistent to keep 854 * -EACCESS, so it seems consistent to keep
853 * that here. 855 * that here.
854 */ 856 */
855 if (mnt_want_write(filp->f_path.mnt)) 857 if (mnt_want_write(filp->f_path.mnt)) {
856 return -EACCES; 858 ret = -EACCES;
859 goto out;
860 }
857 } 861 }
858 ret = __ncp_ioctl(inode, filp, cmd, arg); 862 ret = __ncp_ioctl(filp, cmd, arg);
859 if (ncp_ioctl_need_write(cmd)) 863 if (ncp_ioctl_need_write(cmd))
860 mnt_drop_write(filp->f_path.mnt); 864 mnt_drop_write(filp->f_path.mnt);
865
866out:
867 unlock_kernel();
861 return ret; 868 return ret;
862} 869}
863 870
864#ifdef CONFIG_COMPAT 871#ifdef CONFIG_COMPAT
865long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 872long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
866{ 873{
867 struct inode *inode = file->f_path.dentry->d_inode; 874 long ret;
868 int ret;
869 875
870 lock_kernel(); 876 lock_kernel();
871 arg = (unsigned long) compat_ptr(arg); 877 arg = (unsigned long) compat_ptr(arg);
872 ret = ncp_ioctl(inode, file, cmd, arg); 878 ret = ncp_ioctl(file, cmd, arg);
873 unlock_kernel(); 879 unlock_kernel();
874 return ret; 880 return ret;
875} 881}
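
The ioctl changes above are the standard BKL pushdown: the VFS used to take the big kernel lock around file_operations.ioctl, so converting to .unlocked_ioctl means the filesystem takes lock_kernel() itself, keeping the locking behaviour unchanged while the VFS entry point becomes lock-free. The generic shape, with hypothetical foo_* names:

	static long foo_ioctl(struct file *filp, unsigned int cmd,
			      unsigned long arg)
	{
		long ret;

		lock_kernel();		/* preserve the old serialization, locally */
		ret = __foo_ioctl(filp, cmd, arg);
		unlock_kernel();
		return ret;
	}

	/* in struct file_operations:
	 *	.unlocked_ioctl	= foo_ioctl,	(was: .ioctl = ...) */
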
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a2b8b4df125d..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..7ec9b34a59f8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -933,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
933 } 934 }
934 935
935 fsinfo.fattr = fattr; 936 fsinfo.fattr = fattr;
936 nfs_fattr_init(fattr);
937 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 937 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
938 if (error < 0) 938 if (error < 0)
939 goto out_error; 939 goto out_error;
@@ -965,6 +965,8 @@ out_error:
965static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) 965static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
966{ 966{
967 target->flags = source->flags; 967 target->flags = source->flags;
968 target->rsize = source->rsize;
969 target->wsize = source->wsize;
968 target->acregmin = source->acregmin; 970 target->acregmin = source->acregmin;
969 target->acregmax = source->acregmax; 971 target->acregmax = source->acregmax;
970 target->acdirmin = source->acdirmin; 972 target->acdirmin = source->acdirmin;
@@ -1044,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1044 struct nfs_fh *mntfh) 1046 struct nfs_fh *mntfh)
1045{ 1047{
1046 struct nfs_server *server; 1048 struct nfs_server *server;
1047 struct nfs_fattr fattr; 1049 struct nfs_fattr *fattr;
1048 int error; 1050 int error;
1049 1051
1050 server = nfs_alloc_server(); 1052 server = nfs_alloc_server();
1051 if (!server) 1053 if (!server)
1052 return ERR_PTR(-ENOMEM); 1054 return ERR_PTR(-ENOMEM);
1053 1055
1056 error = -ENOMEM;
1057 fattr = nfs_alloc_fattr();
1058 if (fattr == NULL)
1059 goto error;
1060
1054 /* Get a client representation */ 1061 /* Get a client representation */
1055 error = nfs_init_server(server, data); 1062 error = nfs_init_server(server, data);
1056 if (error < 0) 1063 if (error < 0)
@@ -1061,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1061 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1068 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1062 1069
1063 /* Probe the root fh to retrieve its FSID */ 1070 /* Probe the root fh to retrieve its FSID */
1064 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1071 error = nfs_probe_fsinfo(server, mntfh, fattr);
1065 if (error < 0) 1072 if (error < 0)
1066 goto error; 1073 goto error;
1067 if (server->nfs_client->rpc_ops->version == 3) { 1074 if (server->nfs_client->rpc_ops->version == 3) {
@@ -1074,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1074 server->namelen = NFS2_MAXNAMLEN; 1081 server->namelen = NFS2_MAXNAMLEN;
1075 } 1082 }
1076 1083
1077 if (!(fattr.valid & NFS_ATTR_FATTR)) { 1084 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1078 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 1085 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
1079 if (error < 0) { 1086 if (error < 0) {
1080 dprintk("nfs_create_server: getattr error = %d\n", -error); 1087 dprintk("nfs_create_server: getattr error = %d\n", -error);
1081 goto error; 1088 goto error;
1082 } 1089 }
1083 } 1090 }
1084 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); 1091 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
1085 1092
1086 dprintk("Server FSID: %llx:%llx\n", 1093 dprintk("Server FSID: %llx:%llx\n",
1087 (unsigned long long) server->fsid.major, 1094 (unsigned long long) server->fsid.major,
@@ -1093,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1093 spin_unlock(&nfs_client_lock); 1100 spin_unlock(&nfs_client_lock);
1094 1101
1095 server->mount_time = jiffies; 1102 server->mount_time = jiffies;
1103 nfs_free_fattr(fattr);
1096 return server; 1104 return server;
1097 1105
1098error: 1106error:
1107 nfs_free_fattr(fattr);
1099 nfs_free_server(server); 1108 nfs_free_server(server);
1100 return ERR_PTR(error); 1109 return ERR_PTR(error);
1101} 1110}
@@ -1293,7 +1302,8 @@ static int nfs4_init_server(struct nfs_server *server,
1293 1302
1294 /* Initialise the client representation from the mount data */ 1303 /* Initialise the client representation from the mount data */
1295 server->flags = data->flags; 1304 server->flags = data->flags;
1296 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; 1305 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
1306 NFS_CAP_POSIX_LOCK;
1297 server->options = data->options; 1307 server->options = data->options;
1298 1308
1299 /* Get a client record */ 1309 /* Get a client record */
@@ -1336,7 +1346,7 @@ error:
1336struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, 1346struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1337 struct nfs_fh *mntfh) 1347 struct nfs_fh *mntfh)
1338{ 1348{
1339 struct nfs_fattr fattr; 1349 struct nfs_fattr *fattr;
1340 struct nfs_server *server; 1350 struct nfs_server *server;
1341 int error; 1351 int error;
1342 1352
@@ -1346,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1346 if (!server) 1356 if (!server)
1347 return ERR_PTR(-ENOMEM); 1357 return ERR_PTR(-ENOMEM);
1348 1358
1359 error = -ENOMEM;
1360 fattr = nfs_alloc_fattr();
1361 if (fattr == NULL)
1362 goto error;
1363
1349 /* set up the general RPC client */ 1364 /* set up the general RPC client */
1350 error = nfs4_init_server(server, data); 1365 error = nfs4_init_server(server, data);
1351 if (error < 0) 1366 if (error < 0)
@@ -1360,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1360 goto error; 1375 goto error;
1361 1376
1362 /* Probe the root fh to retrieve its FSID */ 1377 /* Probe the root fh to retrieve its FSID */
1363 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1378 error = nfs4_get_rootfh(server, mntfh);
1364 if (error < 0) 1379 if (error < 0)
1365 goto error; 1380 goto error;
1366 1381
@@ -1371,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1371 1386
1372 nfs4_session_set_rwsize(server); 1387 nfs4_session_set_rwsize(server);
1373 1388
1374 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1389 error = nfs_probe_fsinfo(server, mntfh, fattr);
1375 if (error < 0) 1390 if (error < 0)
1376 goto error; 1391 goto error;
1377 1392
@@ -1385,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1385 1400
1386 server->mount_time = jiffies; 1401 server->mount_time = jiffies;
1387 dprintk("<-- nfs4_create_server() = %p\n", server); 1402 dprintk("<-- nfs4_create_server() = %p\n", server);
1403 nfs_free_fattr(fattr);
1388 return server; 1404 return server;
1389 1405
1390error: 1406error:
1407 nfs_free_fattr(fattr);
1391 nfs_free_server(server); 1408 nfs_free_server(server);
1392 dprintk("<-- nfs4_create_server() = error %d\n", error); 1409 dprintk("<-- nfs4_create_server() = error %d\n", error);
1393 return ERR_PTR(error); 1410 return ERR_PTR(error);
@@ -1401,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1401{ 1418{
1402 struct nfs_client *parent_client; 1419 struct nfs_client *parent_client;
1403 struct nfs_server *server, *parent_server; 1420 struct nfs_server *server, *parent_server;
1404 struct nfs_fattr fattr; 1421 struct nfs_fattr *fattr;
1405 int error; 1422 int error;
1406 1423
1407 dprintk("--> nfs4_create_referral_server()\n"); 1424 dprintk("--> nfs4_create_referral_server()\n");
@@ -1410,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1410 if (!server) 1427 if (!server)
1411 return ERR_PTR(-ENOMEM); 1428 return ERR_PTR(-ENOMEM);
1412 1429
1430 error = -ENOMEM;
1431 fattr = nfs_alloc_fattr();
1432 if (fattr == NULL)
1433 goto error;
1434
1413 parent_server = NFS_SB(data->sb); 1435 parent_server = NFS_SB(data->sb);
1414 parent_client = parent_server->nfs_client; 1436 parent_client = parent_server->nfs_client;
1415 1437
@@ -1439,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1439 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1461 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1440 1462
1441 /* Probe the root fh to retrieve its FSID and filehandle */ 1463 /* Probe the root fh to retrieve its FSID and filehandle */
1442 error = nfs4_path_walk(server, mntfh, data->mnt_path); 1464 error = nfs4_get_rootfh(server, mntfh);
1443 if (error < 0) 1465 if (error < 0)
1444 goto error; 1466 goto error;
1445 1467
1446 /* probe the filesystem info for this server filesystem */ 1468 /* probe the filesystem info for this server filesystem */
1447 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1469 error = nfs_probe_fsinfo(server, mntfh, fattr);
1448 if (error < 0) 1470 if (error < 0)
1449 goto error; 1471 goto error;
1450 1472
@@ -1462,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1462 1484
1463 server->mount_time = jiffies; 1485 server->mount_time = jiffies;
1464 1486
1487 nfs_free_fattr(fattr);
1465 dprintk("<-- nfs_create_referral_server() = %p\n", server); 1488 dprintk("<-- nfs_create_referral_server() = %p\n", server);
1466 return server; 1489 return server;
1467 1490
1468error: 1491error:
1492 nfs_free_fattr(fattr);
1469 nfs_free_server(server); 1493 nfs_free_server(server);
1470 dprintk("<-- nfs4_create_referral_server() = error %d\n", error); 1494 dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
1471 return ERR_PTR(error); 1495 return ERR_PTR(error);
@@ -1481,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1481 struct nfs_fattr *fattr) 1505 struct nfs_fattr *fattr)
1482{ 1506{
1483 struct nfs_server *server; 1507 struct nfs_server *server;
1484 struct nfs_fattr fattr_fsinfo; 1508 struct nfs_fattr *fattr_fsinfo;
1485 int error; 1509 int error;
1486 1510
1487 dprintk("--> nfs_clone_server(,%llx:%llx,)\n", 1511 dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1492,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1492 if (!server) 1516 if (!server)
1493 return ERR_PTR(-ENOMEM); 1517 return ERR_PTR(-ENOMEM);
1494 1518
1519 error = -ENOMEM;
1520 fattr_fsinfo = nfs_alloc_fattr();
1521 if (fattr_fsinfo == NULL)
1522 goto out_free_server;
1523
1495 /* Copy data from the source */ 1524 /* Copy data from the source */
1496 server->nfs_client = source->nfs_client; 1525 server->nfs_client = source->nfs_client;
1497 atomic_inc(&server->nfs_client->cl_count); 1526 atomic_inc(&server->nfs_client->cl_count);
@@ -1508,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1508 nfs_init_server_aclclient(server); 1537 nfs_init_server_aclclient(server);
1509 1538
1510 /* probe the filesystem info for this server filesystem */ 1539 /* probe the filesystem info for this server filesystem */
1511 error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); 1540 error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
1512 if (error < 0) 1541 if (error < 0)
1513 goto out_free_server; 1542 goto out_free_server;
1514 1543
@@ -1530,10 +1559,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1530 1559
1531 server->mount_time = jiffies; 1560 server->mount_time = jiffies;
1532 1561
1562 nfs_free_fattr(fattr_fsinfo);
1533 dprintk("<-- nfs_clone_server() = %p\n", server); 1563 dprintk("<-- nfs_clone_server() = %p\n", server);
1534 return server; 1564 return server;
1535 1565
1536out_free_server: 1566out_free_server:
1567 nfs_free_fattr(fattr_fsinfo);
1537 nfs_free_server(server); 1568 nfs_free_server(server);
1538 dprintk("<-- nfs_clone_server() = error %d\n", error); 1569 dprintk("<-- nfs_clone_server() = error %d\n", error);
1539 return ERR_PTR(error); 1570 return ERR_PTR(error);
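
Every constructor in this file used to keep a struct nfs_fattr on the stack; it is a large structure, and these functions can run on already-deep call chains, so the patch heap-allocates it instead. This is also why the explicit nfs_fattr_init() call disappears from nfs_probe_fsinfo() above: nfs_alloc_fattr() is assumed to hand back an already-initialized fattr. The recurring shape of the conversion:

	struct nfs_fattr *fattr;
	int error = -ENOMEM;

	fattr = nfs_alloc_fattr();
	if (fattr == NULL)
		goto error;
	/* ... probe/getattr calls take the pointer instead of &stack_fattr ... */
	nfs_free_fattr(fattr);		/* freed on success ... */
	return server;
error:
	nfs_free_fattr(fattr);		/* ... and on every failure path */
	nfs_free_server(server);
	return ERR_PTR(error);
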
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..301634543974 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -23,6 +24,8 @@
23 24
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 25static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 26{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
26 kfree(delegation); 29 kfree(delegation);
27} 30}
28 31
@@ -35,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
35 38
36static void nfs_free_delegation(struct nfs_delegation *delegation) 39static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 40{
38 struct rpc_cred *cred;
39
40 cred = rcu_dereference(delegation->cred);
41 rcu_assign_pointer(delegation->cred, NULL);
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 41 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
43 if (cred)
44 put_rpccred(cred);
45} 42}
46 43
47void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -128,21 +125,35 @@ again:
128 */ 125 */
129void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
130{ 127{
131 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 128 struct nfs_delegation *delegation;
132 struct rpc_cred *oldcred; 129 struct rpc_cred *oldcred = NULL;
133 130
134 if (delegation == NULL) 131 rcu_read_lock();
135 return; 132 delegation = rcu_dereference(NFS_I(inode)->delegation);
136 memcpy(delegation->stateid.data, res->delegation.data, 133 if (delegation != NULL) {
137 sizeof(delegation->stateid.data)); 134 spin_lock(&delegation->lock);
138 delegation->type = res->delegation_type; 135 if (delegation->inode != NULL) {
139 delegation->maxsize = res->maxsize; 136 memcpy(delegation->stateid.data, res->delegation.data,
140 oldcred = delegation->cred; 137 sizeof(delegation->stateid.data));
141 delegation->cred = get_rpccred(cred); 138 delegation->type = res->delegation_type;
142 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 139 delegation->maxsize = res->maxsize;
143 NFS_I(inode)->delegation_state = delegation->type; 140 oldcred = delegation->cred;
144 smp_wmb(); 141 delegation->cred = get_rpccred(cred);
145 put_rpccred(oldcred); 142 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
143 &delegation->flags);
144 NFS_I(inode)->delegation_state = delegation->type;
145 spin_unlock(&delegation->lock);
146 put_rpccred(oldcred);
147 rcu_read_unlock();
148 } else {
149 /* We appear to have raced with a delegation return. */
150 spin_unlock(&delegation->lock);
151 rcu_read_unlock();
152 nfs_inode_set_delegation(inode, cred, res);
153 }
154 } else {
155 rcu_read_unlock();
156 }
146} 157}
147 158
148static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) 159static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -165,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
165 return inode; 176 return inode;
166} 177}
167 178
168static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 const nfs4_stateid *stateid,
181 struct nfs_client *clp)
169{ 182{
170 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 183 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock));
171 186
172 if (delegation == NULL) 187 if (delegation == NULL)
173 goto nomatch; 188 goto nomatch;
@@ -194,11 +209,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
194{ 209{
195 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
196 struct nfs_inode *nfsi = NFS_I(inode); 211 struct nfs_inode *nfsi = NFS_I(inode);
197 struct nfs_delegation *delegation; 212 struct nfs_delegation *delegation, *old_delegation;
198 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
199 int status = 0; 214 int status = 0;
200 215
201 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
202 if (delegation == NULL) 217 if (delegation == NULL)
203 return -ENOMEM; 218 return -ENOMEM;
204 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -212,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
212 spin_lock_init(&delegation->lock); 227 spin_lock_init(&delegation->lock);
213 228
214 spin_lock(&clp->cl_lock); 229 spin_lock(&clp->cl_lock);
215 if (rcu_dereference(nfsi->delegation) != NULL) { 230 old_delegation = rcu_dereference_protected(nfsi->delegation,
216 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 231 lockdep_is_held(&clp->cl_lock));
217 sizeof(delegation->stateid)) == 0 && 232 if (old_delegation != NULL) {
218 delegation->type == nfsi->delegation->type) { 233 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 &&
235 delegation->type == old_delegation->type) {
219 goto out; 236 goto out;
220 } 237 }
221 /* 238 /*
@@ -225,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
225 dfprintk(FILE, "%s: server %s handed out " 242 dfprintk(FILE, "%s: server %s handed out "
226 "a duplicate delegation!\n", 243 "a duplicate delegation!\n",
227 __func__, clp->cl_hostname); 244 __func__, clp->cl_hostname);
228 if (delegation->type <= nfsi->delegation->type) { 245 if (delegation->type <= old_delegation->type) {
229 freeme = delegation; 246 freeme = delegation;
230 delegation = NULL; 247 delegation = NULL;
231 goto out; 248 goto out;
232 } 249 }
233 freeme = nfs_detach_delegation_locked(nfsi, NULL); 250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
234 } 251 }
235 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 252 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
236 nfsi->delegation_state = delegation->type; 253 nfsi->delegation_state = delegation->type;
@@ -300,7 +317,7 @@ restart:
300 if (inode == NULL) 317 if (inode == NULL)
301 continue; 318 continue;
302 spin_lock(&clp->cl_lock); 319 spin_lock(&clp->cl_lock);
303 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 320 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
304 spin_unlock(&clp->cl_lock); 321 spin_unlock(&clp->cl_lock);
305 rcu_read_unlock(); 322 rcu_read_unlock();
306 if (delegation != NULL) { 323 if (delegation != NULL) {
@@ -329,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
329 struct nfs_inode *nfsi = NFS_I(inode); 346 struct nfs_inode *nfsi = NFS_I(inode);
330 struct nfs_delegation *delegation; 347 struct nfs_delegation *delegation;
331 348
332 if (rcu_dereference(nfsi->delegation) != NULL) { 349 if (rcu_access_pointer(nfsi->delegation) != NULL) {
333 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
334 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
335 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
336 if (delegation != NULL) 353 if (delegation != NULL)
337 nfs_do_return_delegation(inode, delegation, 0); 354 nfs_do_return_delegation(inode, delegation, 0);
@@ -345,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
345 struct nfs_delegation *delegation; 362 struct nfs_delegation *delegation;
346 int err = 0; 363 int err = 0;
347 364
348 if (rcu_dereference(nfsi->delegation) != NULL) { 365 if (rcu_access_pointer(nfsi->delegation) != NULL) {
349 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
350 delegation = nfs_detach_delegation_locked(nfsi, NULL); 367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
351 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
352 if (delegation != NULL) { 369 if (delegation != NULL) {
353 nfs_msync_inode(inode); 370 nfs_msync_inode(inode);
@@ -525,7 +542,7 @@ restart:
525 if (inode == NULL) 542 if (inode == NULL)
526 continue; 543 continue;
527 spin_lock(&clp->cl_lock); 544 spin_lock(&clp->cl_lock);
528 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 545 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
529 spin_unlock(&clp->cl_lock); 546 spin_unlock(&clp->cl_lock);
530 rcu_read_unlock(); 547 rcu_read_unlock();
531 if (delegation != NULL) 548 if (delegation != NULL)
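
The delegation changes are mostly about using the RCU accessor that matches the calling context (which also keeps lockdep-RCU quiet), plus a GFP_KERNEL to GFP_NOFS switch so the allocation cannot recurse into filesystem writeback. The discipline, as applied in this file:

	/* Bare NULL test, never dereferenced: no RCU section needed. */
	if (rcu_access_pointer(nfsi->delegation) != NULL) {
		spin_lock(&clp->cl_lock);
		/* Update side: the spinlock makes the pointer stable. */
		delegation = rcu_dereference_protected(nfsi->delegation,
					lockdep_is_held(&clp->cl_lock));
		/* ... detach under the lock ... */
		spin_unlock(&clp->cl_lock);
	}

	rcu_read_lock();
	/* Read side: valid only until rcu_read_unlock(). */
	delegation = rcu_dereference(nfsi->delegation);
	rcu_read_unlock();
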
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c6f2750648f4..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 530 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 531 *desc = &my_desc;
532 struct nfs_entry my_entry; 532 struct nfs_entry my_entry;
533 struct nfs_fh fh; 533 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 534
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 535 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 536 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 552
555 my_entry.cookie = my_entry.prev_cookie = 0; 553 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 554 my_entry.eof = 0;
557 my_entry.fh = &fh; 555 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 556 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 557 if (my_entry.fh == NULL || my_entry.fattr == NULL)
558 goto out_alloc_failed;
559
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
@@ -598,7 +598,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 598 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 599 if (res > 0)
600 res = 0; 600 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 601out_alloc_failed:
602 nfs_free_fattr(my_entry.fattr);
603 nfs_free_fhandle(my_entry.fh);
604 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 605 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 606 res);
604 return res; 607 return res;
@@ -638,8 +641,10 @@ out:
638 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
639 * is a dummy operation. 642 * is a dummy operation.
640 */ 643 */
641static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
642{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
643 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
644 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
645 datasync); 650 datasync);
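
This hunk tracks the VFS prototype change in this series: ->fsync() no longer receives a dentry argument, so implementations recover it from the struct file they already hold. A compile-and-run sketch of the new calling shape, using stand-in types (names hypothetical):

#include <stdio.h>

struct dentry { const char *name; };
struct path   { struct dentry *dentry; };
struct file   { struct path f_path; };

static int fsync_dir(struct file *filp, int datasync)
{
    /* The dentry is derived from the file, as nfs_fsync_dir() now does. */
    struct dentry *dentry = filp->f_path.dentry;

    printf("fsync %s datasync=%d\n", dentry->name, datasync);
    return 0;   /* directory ops are synchronous: nothing to flush */
}

int main(void)
{
    struct dentry d = { "docs" };
    struct file f = { { &d } };
    return fsync_dir(&f, 0);
}
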
@@ -776,9 +781,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 781 struct inode *dir;
777 struct inode *inode; 782 struct inode *inode;
778 struct dentry *parent; 783 struct dentry *parent;
784 struct nfs_fh *fhandle = NULL;
785 struct nfs_fattr *fattr = NULL;
779 int error; 786 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 787
783 parent = dget_parent(dentry); 788 parent = dget_parent(dentry);
784 dir = parent->d_inode; 789 dir = parent->d_inode;
@@ -811,14 +816,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 816 if (NFS_STALE(inode))
812 goto out_bad; 817 goto out_bad;
813 818
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 819 error = -ENOMEM;
820 fhandle = nfs_alloc_fhandle();
821 fattr = nfs_alloc_fattr();
822 if (fhandle == NULL || fattr == NULL)
823 goto out_error;
824
825 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 826 if (error)
816 goto out_bad; 827 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 828 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 829 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 830 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 831 goto out_bad;
821 832
833 nfs_free_fattr(fattr);
834 nfs_free_fhandle(fhandle);
822out_set_verifier: 835out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 836 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 837 out_valid:
@@ -837,14 +850,26 @@ out_zap_parent:
837 /* If we have submounts, don't unhash ! */ 850 /* If we have submounts, don't unhash ! */
838 if (have_submounts(dentry)) 851 if (have_submounts(dentry))
839 goto out_valid; 852 goto out_valid;
853 if (dentry->d_flags & DCACHE_DISCONNECTED)
854 goto out_valid;
840 shrink_dcache_parent(dentry); 855 shrink_dcache_parent(dentry);
841 } 856 }
842 d_drop(dentry); 857 d_drop(dentry);
858 nfs_free_fattr(fattr);
859 nfs_free_fhandle(fhandle);
843 dput(parent); 860 dput(parent);
844 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 861 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
845 __func__, dentry->d_parent->d_name.name, 862 __func__, dentry->d_parent->d_name.name,
846 dentry->d_name.name); 863 dentry->d_name.name);
847 return 0; 864 return 0;
865out_error:
866 nfs_free_fattr(fattr);
867 nfs_free_fhandle(fhandle);
868 dput(parent);
869 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
870 __func__, dentry->d_parent->d_name.name,
871 dentry->d_name.name, error);
872 return error;
848} 873}
849 874
850/* 875/*
@@ -909,9 +934,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
909 struct dentry *res; 934 struct dentry *res;
910 struct dentry *parent; 935 struct dentry *parent;
911 struct inode *inode = NULL; 936 struct inode *inode = NULL;
937 struct nfs_fh *fhandle = NULL;
938 struct nfs_fattr *fattr = NULL;
912 int error; 939 int error;
913 struct nfs_fh fhandle;
914 struct nfs_fattr fattr;
915 940
916 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 941 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name); 942 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -921,7 +946,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
921 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 946 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
922 goto out; 947 goto out;
923 948
924 res = ERR_PTR(-ENOMEM);
925 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 949 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
926 950
927 /* 951 /*
@@ -934,17 +958,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
934 goto out; 958 goto out;
935 } 959 }
936 960
961 res = ERR_PTR(-ENOMEM);
962 fhandle = nfs_alloc_fhandle();
963 fattr = nfs_alloc_fattr();
964 if (fhandle == NULL || fattr == NULL)
965 goto out;
966
937 parent = dentry->d_parent; 967 parent = dentry->d_parent;
938 /* Protect against concurrent sillydeletes */ 968 /* Protect against concurrent sillydeletes */
939 nfs_block_sillyrename(parent); 969 nfs_block_sillyrename(parent);
940 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 970 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
941 if (error == -ENOENT) 971 if (error == -ENOENT)
942 goto no_entry; 972 goto no_entry;
943 if (error < 0) { 973 if (error < 0) {
944 res = ERR_PTR(error); 974 res = ERR_PTR(error);
945 goto out_unblock_sillyrename; 975 goto out_unblock_sillyrename;
946 } 976 }
947 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 977 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
948 res = (struct dentry *)inode; 978 res = (struct dentry *)inode;
949 if (IS_ERR(res)) 979 if (IS_ERR(res))
950 goto out_unblock_sillyrename; 980 goto out_unblock_sillyrename;
@@ -960,6 +990,8 @@ no_entry:
960out_unblock_sillyrename: 990out_unblock_sillyrename:
961 nfs_unblock_sillyrename(parent); 991 nfs_unblock_sillyrename(parent);
962out: 992out:
993 nfs_free_fattr(fattr);
994 nfs_free_fhandle(fhandle);
963 return res; 995 return res;
964} 996}
965 997
@@ -1025,12 +1057,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1057 res = NULL;
1026 goto out; 1058 goto out;
1027 /* This turned out not to be a regular file */ 1059 /* This turned out not to be a regular file */
1060 case -EISDIR:
1028 case -ENOTDIR: 1061 case -ENOTDIR:
1029 goto no_open; 1062 goto no_open;
1030 case -ELOOP: 1063 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1064 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1065 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1066 /* case -EINVAL: */
1035 default: 1067 default:
1036 goto out; 1068 goto out;
@@ -1050,7 +1082,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1050 struct inode *dir; 1082 struct inode *dir;
1051 int openflags, ret = 0; 1083 int openflags, ret = 0;
1052 1084
1053 if (!is_atomic_open(nd)) 1085 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1054 goto no_open; 1086 goto no_open;
1055 parent = dget_parent(dentry); 1087 parent = dget_parent(dentry);
1056 dir = parent->d_inode; 1088 dir = parent->d_inode;
@@ -1667,28 +1699,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1667 smp_mb__after_atomic_dec(); 1699 smp_mb__after_atomic_dec();
1668} 1700}
1669 1701
1702static void nfs_access_free_list(struct list_head *head)
1703{
1704 struct nfs_access_entry *cache;
1705
1706 while (!list_empty(head)) {
1707 cache = list_entry(head->next, struct nfs_access_entry, lru);
1708 list_del(&cache->lru);
1709 nfs_access_free_entry(cache);
1710 }
1711}
1712
1670int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1713int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1671{ 1714{
1672 LIST_HEAD(head); 1715 LIST_HEAD(head);
1673 struct nfs_inode *nfsi; 1716 struct nfs_inode *nfsi;
1674 struct nfs_access_entry *cache; 1717 struct nfs_access_entry *cache;
1675 1718
1676restart: 1719 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1720 return (nr_to_scan == 0) ? 0 : -1;
1721
1677 spin_lock(&nfs_access_lru_lock); 1722 spin_lock(&nfs_access_lru_lock);
1678 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1723 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1679 struct rw_semaphore *s_umount;
1680 struct inode *inode; 1724 struct inode *inode;
1681 1725
1682 if (nr_to_scan-- == 0) 1726 if (nr_to_scan-- == 0)
1683 break; 1727 break;
1684 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1728 inode = &nfsi->vfs_inode;
1685 if (!down_read_trylock(s_umount))
1686 continue;
1687 inode = igrab(&nfsi->vfs_inode);
1688 if (inode == NULL) {
1689 up_read(s_umount);
1690 continue;
1691 }
1692 spin_lock(&inode->i_lock); 1729 spin_lock(&inode->i_lock);
1693 if (list_empty(&nfsi->access_cache_entry_lru)) 1730 if (list_empty(&nfsi->access_cache_entry_lru))
1694 goto remove_lru_entry; 1731 goto remove_lru_entry;
@@ -1702,61 +1739,48 @@ restart:
1702 else { 1739 else {
1703remove_lru_entry: 1740remove_lru_entry:
1704 list_del_init(&nfsi->access_cache_inode_lru); 1741 list_del_init(&nfsi->access_cache_inode_lru);
1742 smp_mb__before_clear_bit();
1705 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1744 smp_mb__after_clear_bit();
1706 } 1745 }
1707 spin_unlock(&inode->i_lock); 1746 spin_unlock(&inode->i_lock);
1708 spin_unlock(&nfs_access_lru_lock);
1709 iput(inode);
1710 up_read(s_umount);
1711 goto restart;
1712 } 1747 }
1713 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1714 while (!list_empty(&head)) { 1749 nfs_access_free_list(&head);
1715 cache = list_entry(head.next, struct nfs_access_entry, lru);
1716 list_del(&cache->lru);
1717 nfs_access_free_entry(cache);
1718 }
1719 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1750 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1720} 1751}
1721 1752
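
The rewritten shrinker entry point bails out early unless the caller's allocation mask is a superset of GFP_KERNEL, i.e. unless the reclaim context may block, do I/O and re-enter the filesystem; the same test guards nfs_release_page() further down in this patch. A userspace sketch of the mask check (the flag values are illustrative, not the kernel's):

#include <stdio.h>

#define __GFP_WAIT 0x1u
#define __GFP_IO   0x2u
#define __GFP_FS   0x4u
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

/* Freeing access-cache entries can re-enter the FS, so it is only
 * safe from a context that allows blocking, I/O and FS recursion.
 * nr_to_scan == 0 means "just report a count", so answer 0 rather
 * than the -1 that tells the VM "cannot shrink from here". */
static int shrinker(int nr_to_scan, unsigned int gfp_mask)
{
    if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
        return (nr_to_scan == 0) ? 0 : -1;
    /* ...scan the LRU, unhook entries, free them... */
    return 0;
}

int main(void)
{
    printf("%d\n", shrinker(128, __GFP_WAIT));  /* -1: unsafe context */
    printf("%d\n", shrinker(128, GFP_KERNEL));  /*  0: scanned */
    return 0;
}
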
1722static void __nfs_access_zap_cache(struct inode *inode) 1753static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1723{ 1754{
1724 struct nfs_inode *nfsi = NFS_I(inode);
1725 struct rb_root *root_node = &nfsi->access_cache; 1755 struct rb_root *root_node = &nfsi->access_cache;
1726 struct rb_node *n, *dispose = NULL; 1756 struct rb_node *n;
1727 struct nfs_access_entry *entry; 1757 struct nfs_access_entry *entry;
1728 1758
1729 /* Unhook entries from the cache */ 1759 /* Unhook entries from the cache */
1730 while ((n = rb_first(root_node)) != NULL) { 1760 while ((n = rb_first(root_node)) != NULL) {
1731 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1761 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1732 rb_erase(n, root_node); 1762 rb_erase(n, root_node);
1733 list_del(&entry->lru); 1763 list_move(&entry->lru, head);
1734 n->rb_left = dispose;
1735 dispose = n;
1736 } 1764 }
1737 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1765 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1738 spin_unlock(&inode->i_lock);
1739
1740 /* Now kill them all! */
1741 while (dispose != NULL) {
1742 n = dispose;
1743 dispose = n->rb_left;
1744 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1745 }
1746} 1766}
1747 1767
1748void nfs_access_zap_cache(struct inode *inode) 1768void nfs_access_zap_cache(struct inode *inode)
1749{ 1769{
1770 LIST_HEAD(head);
1771
1772 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1773 return;
1750 /* Remove from global LRU init */ 1774 /* Remove from global LRU init */
1751 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1775 spin_lock(&nfs_access_lru_lock);
1752 spin_lock(&nfs_access_lru_lock); 1776 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1753 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1777 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1754 spin_unlock(&nfs_access_lru_lock);
1755 }
1756 1778
1757 spin_lock(&inode->i_lock); 1779 spin_lock(&inode->i_lock);
1758 /* This will release the spinlock */ 1780 __nfs_access_zap_cache(NFS_I(inode), &head);
1759 __nfs_access_zap_cache(inode); 1781 spin_unlock(&inode->i_lock);
1782 spin_unlock(&nfs_access_lru_lock);
1783 nfs_access_free_list(&head);
1760} 1784}
1761 1785
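
__nfs_access_zap_cache() no longer frees entries itself, and no longer drops the caller's spinlock behind its back: it moves everything onto a caller-supplied list, and nfs_access_free_list() disposes of the entries once every lock has been released. A pthreads sketch of that unhook-under-lock, free-outside-lock idiom:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { struct entry *next; };

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache;             /* protected by cache_lock */

static void zap_cache(void)
{
    struct entry *head;

    /* Unhook the whole list while the lock is held... */
    pthread_mutex_lock(&cache_lock);
    head = cache;
    cache = NULL;
    pthread_mutex_unlock(&cache_lock);

    /* ...then free at leisure: freeing may be slow, and in the
     * kernel version doing it under a spinlock also invited
     * lock-ordering trouble with the global LRU lock. */
    while (head != NULL) {
        struct entry *e = head;
        head = e->next;
        free(e);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct entry *e = malloc(sizeof(*e));
        if (e == NULL)
            break;
        e->next = cache;
        cache = e;
    }
    zap_cache();
    printf("cache emptied\n");
    return 0;
}
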
1762static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1786static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1807,8 +1831,8 @@ out_stale:
1807 nfs_access_free_entry(cache); 1831 nfs_access_free_entry(cache);
1808 return -ENOENT; 1832 return -ENOENT;
1809out_zap: 1833out_zap:
1810 /* This will release the spinlock */ 1834 spin_unlock(&inode->i_lock);
1811 __nfs_access_zap_cache(inode); 1835 nfs_access_zap_cache(inode);
1812 return -ENOENT; 1836 return -ENOENT;
1813} 1837}
1814 1838
@@ -1863,9 +1887,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1863 smp_mb__after_atomic_inc(); 1887 smp_mb__after_atomic_inc();
1864 1888
1865 /* Add inode to global LRU list */ 1889 /* Add inode to global LRU list */
1866 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1890 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1867 spin_lock(&nfs_access_lru_lock); 1891 spin_lock(&nfs_access_lru_lock);
1868 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1892 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1893 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1894 &nfs_access_lru_list);
1869 spin_unlock(&nfs_access_lru_lock); 1895 spin_unlock(&nfs_access_lru_lock);
1870 } 1896 }
1871} 1897}
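
The LRU insertion above now tests the flag twice: a lock-free test_bit() keeps the common already-listed case off the global spinlock, and the test_and_set_bit() under the lock closes the race window between two first-time callers. A userspace sketch of the same double-checked pattern, using C11 atomics as stand-ins for the kernel bit ops:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_lru;               /* NFS_INO_ACL_LRU_SET stand-in */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static int lru_entries;                 /* protected by lru_lock */

static void add_to_lru(void)
{
    /* Fast path: already listed, skip the global lock entirely. */
    if (atomic_load(&on_lru))
        return;
    pthread_mutex_lock(&lru_lock);
    /* Re-test under the lock: only the caller that flips the flag
     * 0 -> 1 performs the insertion, so two racing first-time
     * callers cannot double-add the inode. */
    if (!atomic_exchange(&on_lru, 1))
        lru_entries++;                  /* list_add_tail() stand-in */
    pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
    add_to_lru();
    add_to_lru();
    printf("entries: %d\n", lru_entries);   /* 1, not 2 */
    return 0;
}
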
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 161 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 162 struct nfs_inode *nfsi = NFS_I(inode);
163 163
164 if (server->flags & NFS_MOUNT_NOAC) 164 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 165 goto out_noreval;
166
166 if (filp->f_flags & O_DIRECT) 167 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 168 goto force_reval;
168 if (nfsi->npages != 0) 169 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 170 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 171 if (nfs_attribute_timeout(inode))
171 return 0; 172 goto force_reval;
173out_noreval:
174 return 0;
172force_reval: 175force_reval:
173 return __nfs_revalidate_inode(server, inode); 176 return __nfs_revalidate_inode(server, inode);
174} 177}
@@ -319,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
319 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
320 */ 323 */
321static int 324static int
322nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
323{ 326{
327 struct dentry *dentry = file->f_path.dentry;
324 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
325 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
326 330
@@ -491,7 +495,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 495{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 496 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 497
494 if (gfp & __GFP_WAIT) 498 /* Only do I/O if gfp is a superset of GFP_KERNEL */
499 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 500 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 501 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 502 if (PagePrivate(page))
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -466,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
466 struct list_head *pages, 467 struct list_head *pages,
467 unsigned *nr_pages) 468 unsigned *nr_pages)
468{ 469{
469 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
470 472
471 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
472 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..7428f7d6273b 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
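
nfs4_path_walk() is gone: the walk from the server root to the mount target leaves the client's hand-rolled loop (elsewhere in this series it becomes a regular VFS path lookup), and the remaining nfs4_get_rootfh() only fetches and sanity-checks the root filehandle. Note the new guard it gains: an NFSv4 server need not return every attribute, so fattr->mode is tested only after its validity bit is confirmed. A sketch of that validity-bit pattern (flag values illustrative, not the kernel's):

#include <stdio.h>

#define FATTR_MODE        0x1u
#define FATTR_V4_REFERRAL 0x2u
#define S_IFDIR           0x4000u

struct fattr { unsigned int valid; unsigned int mode; };

static int check_rootfh(const struct fattr *f)
{
    /* mode is garbage unless the server actually returned it */
    if (!(f->valid & FATTR_MODE) || !(f->mode & S_IFDIR))
        return -20;                     /* -ENOTDIR */
    if (f->valid & FATTR_V4_REFERRAL)
        return -66;                     /* -EREMOTE */
    return 0;
}

int main(void)
{
    struct fattr dir  = { .valid = FATTR_MODE, .mode = S_IFDIR };
    struct fattr bare = { .valid = 0,          .mode = S_IFDIR };
    printf("%d %d\n", check_rootfh(&dir), check_rootfh(&bare));
    return 0;
}
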
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
 194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e358df75a6ad..099b3518feea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -392,8 +393,8 @@ int
392nfs_setattr(struct dentry *dentry, struct iattr *attr) 393nfs_setattr(struct dentry *dentry, struct iattr *attr)
393{ 394{
394 struct inode *inode = dentry->d_inode; 395 struct inode *inode = dentry->d_inode;
395 struct nfs_fattr fattr; 396 struct nfs_fattr *fattr;
396 int error; 397 int error = -ENOMEM;
397 398
398 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
399 400
@@ -416,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
416 filemap_write_and_wait(inode->i_mapping); 417 filemap_write_and_wait(inode->i_mapping);
417 nfs_wb_all(inode); 418 nfs_wb_all(inode);
418 } 419 }
420
421 fattr = nfs_alloc_fattr();
422 if (fattr == NULL)
423 goto out;
419 /* 424 /*
420 * Return any delegations if we're going to change ACLs 425 * Return any delegations if we're going to change ACLs
421 */ 426 */
422 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 427 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
423 nfs_inode_return_delegation(inode); 428 nfs_inode_return_delegation(inode);
424 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 429 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
425 if (error == 0) 430 if (error == 0)
426 nfs_refresh_inode(inode, &fattr); 431 nfs_refresh_inode(inode, fattr);
432 nfs_free_fattr(fattr);
433out:
427 return error; 434 return error;
428} 435}
429 436
@@ -622,10 +629,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
622 list_for_each_entry(pos, &nfsi->open_files, list) { 629 list_for_each_entry(pos, &nfsi->open_files, list) {
623 if (cred != NULL && pos->cred != cred) 630 if (cred != NULL && pos->cred != cred)
624 continue; 631 continue;
625 if ((pos->mode & mode) == mode) { 632 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
626 ctx = get_nfs_open_context(pos); 633 continue;
627 break; 634 ctx = get_nfs_open_context(pos);
628 } 635 break;
629 } 636 }
630 spin_unlock(&inode->i_lock); 637 spin_unlock(&inode->i_lock);
631 return ctx; 638 return ctx;
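
nfs_find_open_context() previously matched any context whose mode was a superset of the request, so asking for a read-only context could hand back a read/write one; it now demands an exact match on the FMODE_READ/FMODE_WRITE bits. The difference, reduced to the two tests:

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

int main(void)
{
    int pos_mode = FMODE_READ | FMODE_WRITE;    /* candidate context */
    int want = FMODE_READ;                      /* requested mode */

    /* Old test: subset match -- a read/write context satisfies a
     * read-only request. */
    printf("subset: %d\n", (pos_mode & want) == want);

    /* New test: exact match on the read/write bits only. */
    printf("exact:  %d\n",
           (pos_mode & (FMODE_READ | FMODE_WRITE)) == want);
    return 0;
}
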
@@ -681,7 +688,7 @@ int
681__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 688__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
682{ 689{
683 int status = -ESTALE; 690 int status = -ESTALE;
684 struct nfs_fattr fattr; 691 struct nfs_fattr *fattr = NULL;
685 struct nfs_inode *nfsi = NFS_I(inode); 692 struct nfs_inode *nfsi = NFS_I(inode);
686 693
687 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 694 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -692,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
692 if (NFS_STALE(inode)) 699 if (NFS_STALE(inode))
693 goto out; 700 goto out;
694 701
702 status = -ENOMEM;
703 fattr = nfs_alloc_fattr();
704 if (fattr == NULL)
705 goto out;
706
695 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 707 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
696 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 708 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
697 if (status != 0) { 709 if (status != 0) {
698 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 710 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
699 inode->i_sb->s_id, 711 inode->i_sb->s_id,
@@ -706,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
706 goto out; 718 goto out;
707 } 719 }
708 720
709 status = nfs_refresh_inode(inode, &fattr); 721 status = nfs_refresh_inode(inode, fattr);
710 if (status) { 722 if (status) {
711 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 723 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
712 inode->i_sb->s_id, 724 inode->i_sb->s_id,
@@ -722,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
722 (long long)NFS_FILEID(inode)); 734 (long long)NFS_FILEID(inode));
723 735
724 out: 736 out:
737 nfs_free_fattr(fattr);
725 return status; 738 return status;
726} 739}
727 740
@@ -729,9 +742,14 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 742{
730 struct nfs_inode *nfsi = NFS_I(inode); 743 struct nfs_inode *nfsi = NFS_I(inode);
731 744
745 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
746}
747
748static int nfs_attribute_cache_expired(struct inode *inode)
749{
732 if (nfs_have_delegated_attributes(inode)) 750 if (nfs_have_delegated_attributes(inode))
733 return 0; 751 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 752 return nfs_attribute_timeout(inode);
735} 753}
736 754
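
nfs_attribute_timeout() is split in two: the raw jiffies-range test keeps its name, while the delegation short-circuit moves into the new nfs_attribute_cache_expired(), which the revalidation paths now call instead. The range test depends on the wraparound-safe comparison that time_in_range_open() provides; a sketch of the underlying unsigned arithmetic (assuming the kernel's time_after_eq/time_before semantics):

#include <stdio.h>

/* Wraparound-safe "a in [b, c)" on an unsigned tick counter. */
static int in_range_open(unsigned long a, unsigned long b,
                         unsigned long c)
{
    return (long)(a - b) >= 0 && (long)(a - c) < 0;
}

int main(void)
{
    unsigned long jiffies = (unsigned long)-5;   /* about to wrap */
    unsigned long stamp = jiffies - 10;          /* read_cache_jiffies */
    unsigned long timeo = 60;                    /* attrtimeo */

    /* Still fresh even though stamp + timeo wraps past zero. */
    printf("fresh: %d\n", in_range_open(jiffies, stamp, stamp + timeo));
    return 0;
}
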
737/** 755/**
@@ -744,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode)
744int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 762int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
745{ 763{
746 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 764 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
747 && !nfs_attribute_timeout(inode)) 765 && !nfs_attribute_cache_expired(inode))
748 return NFS_STALE(inode) ? -ESTALE : 0; 766 return NFS_STALE(inode) ? -ESTALE : 0;
749 return __nfs_revalidate_inode(server, inode); 767 return __nfs_revalidate_inode(server, inode);
750} 768}
@@ -781,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
781 int ret = 0; 799 int ret = 0;
782 800
783 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 801 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
784 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 802 || nfs_attribute_cache_expired(inode)
803 || NFS_STALE(inode)) {
785 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 804 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
786 if (ret < 0) 805 if (ret < 0)
787 goto out; 806 goto out;
@@ -915,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
915 fattr->gencount = nfs_inc_attr_generation_counter(); 934 fattr->gencount = nfs_inc_attr_generation_counter();
916} 935}
917 936
937struct nfs_fattr *nfs_alloc_fattr(void)
938{
939 struct nfs_fattr *fattr;
940
941 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
942 if (fattr != NULL)
943 nfs_fattr_init(fattr);
944 return fattr;
945}
946
947struct nfs_fh *nfs_alloc_fhandle(void)
948{
949 struct nfs_fh *fh;
950
951 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
952 if (fh != NULL)
953 fh->size = 0;
954 return fh;
955}
956
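
These are the two helpers the whole patch leans on. Both allocate with GFP_NOFS, since these paths can be reached from memory reclaim or writeback and the allocation must not recurse back into the filesystem, and both hand back an object in a defined state (nfs_fattr_init(), fh->size = 0) so callers may compare or free it unconditionally. A sketch of why the constructor invariant matters for a size-prefixed comparison like nfs_compare_fh() (stand-in types, malloc() in place of kmalloc):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fh { unsigned short size; unsigned char data[128]; };

/* Constructor-style allocator: size == 0 marks an empty handle, so
 * a comparison that reads size (and then size bytes of data) never
 * touches uninitialized memory. */
static struct fh *alloc_fh(void)
{
    struct fh *fh = malloc(sizeof(*fh));
    if (fh != NULL)
        fh->size = 0;
    return fh;
}

static int compare_fh(const struct fh *a, const struct fh *b)
{
    if (a->size != b->size)
        return 1;
    return memcmp(a->data, b->data, a->size) != 0;
}

int main(void)
{
    struct fh *a = alloc_fh(), *b = alloc_fh();
    if (a == NULL || b == NULL)
        return 1;
    printf("differ: %d\n", compare_fh(a, b));   /* 0: both empty */
    free(b);
    free(a);
    return 0;
}
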
918/** 957/**
919 * nfs_inode_attrs_need_update - check if the inode attributes need updating 958 * nfs_inode_attrs_need_update - check if the inode attributes need updating
920 * @inode - pointer to inode 959 * @inode - pointer to inode
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..d8bd619e386c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 246
247extern int nfs4_path_walk(struct nfs_server *server, 247extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 248#endif
251 249
252/* read.c */ 250/* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
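
The iostat addend parameters switch from unsigned long to long. The plausible reading, an assumption rather than anything spelled out in the hunk, is that callers need to pass negative adjustments, for example to back out bytes counted optimistically before a short or failed transfer:

#include <stdio.h>

static long long bytes_stat;            /* per-cpu counter stand-in */

static void add_stat(long addend)       /* signed on purpose */
{
    bytes_stat += addend;
}

int main(void)
{
    add_stat(8192);                     /* counted when issued */
    add_stat(-4096);                    /* hypothetical: back out the
                                           half that did not complete */
    printf("%lld\n", bytes_stat);       /* 4096 */
    return 0;
}
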
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -104,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
104 struct vfsmount *mnt; 105 struct vfsmount *mnt;
105 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
106 struct dentry *parent; 107 struct dentry *parent;
107 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
108 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
109 int err; 110 int err;
110 111
111 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -114,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
114 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
115 goto out_err; 116 goto out_err;
116 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
117 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
118 dput(nd->path.dentry); 125 dput(nd->path.dentry);
119 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -122,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
122 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
123 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
124 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
125 &fh, &fattr); 132 fh, fattr);
126 dput(parent); 133 dput(parent);
127 if (err != 0) 134 if (err != 0)
128 goto out_err; 135 goto out_err;
129 136
130 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
131 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
132 else 139 else
133 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
134 &fattr); 141 fattr);
135 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
136 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
137 goto out_err; 144 goto out_err;
@@ -150,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
150 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
151 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
152out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
153 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
154 163
155 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
@@ -184,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
184struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
185{ 186{
186 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
187 struct nfs_fattr fattr;
188 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
189 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
190 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 .fattr = &fattr, 195 0
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -227,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
227 227
228 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
230 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
231 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
232 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
233 236
@@ -237,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
237 240
238 switch (status) { 241 switch (status) {
239 case 0: 242 case 0:
240 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
241 break; 244 break;
242 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
243 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -277,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
277getout: 280getout:
278 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
279 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
280 284
281 if (status != 0) { 285 if (status != 0) {
282 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -289,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
289 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
290{ 294{
291 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
292 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
293 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
294 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
295 .inode = inode, 299 .inode = inode,
@@ -334,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
334 } 338 }
335 339
336 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
337 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
338 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
339 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
340 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
341 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -343,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
343 352
344 switch (status) { 353 switch (status) {
345 case 0: 354 case 0:
346 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
347 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
348 break; 357 break;
349 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -354,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
354 case -ENOTSUPP: 363 case -ENOTSUPP:
355 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
356 } 365 }
366 nfs_free_fattr(fattr);
357out_freepages: 367out_freepages:
358 while (args.npages != 0) { 368 while (args.npages != 0) {
359 args.npages--; 369 args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -143,14 +144,12 @@ static int
143nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
144 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
145{ 146{
146 struct nfs_fattr dir_attr;
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
148 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
149 .name = name->name, 149 .name = name->name,
150 .len = name->len 150 .len = name->len
151 }; 151 };
152 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
153 .dir_attr = &dir_attr,
154 .fh = fhandle, 153 .fh = fhandle,
155 .fattr = fattr 154 .fattr = fattr
156 }; 155 };
@@ -162,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
162 int status; 161 int status;
163 162
164 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
165 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
166 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
167 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
168 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
169 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
170 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
171 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
172 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
174 } 176 }
177 nfs_free_fattr(res.dir_attr);
175 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
176 return status; 179 return status;
177} 180}
178 181
179static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
180{ 183{
181 struct nfs_fattr fattr;
182 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
183 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
184 }; 186 };
185 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
186 .fattr = &fattr,
187 };
188 struct rpc_message msg = { 188 struct rpc_message msg = {
189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
190 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -192,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
192 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
193 }; 193 };
194 int mode = entry->mask; 194 int mode = entry->mask;
195 int status; 195 int status = -ENOMEM;
196 196
197 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
198 198
@@ -209,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
209 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
210 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
211 } 211 }
212 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
213 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
214 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
215 if (status == 0) { 219 if (status == 0) {
216 entry->mask = 0; 220 entry->mask = 0;
217 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -221,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
221 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
222 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
223 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
224 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
225 return status; 231 return status;
226} 232}
@@ -228,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
228static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
229 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
230{ 236{
231 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
232 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
233 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
234 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -238,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
238 struct rpc_message msg = { 244 struct rpc_message msg = {
239 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
240 .rpc_argp = &args, 246 .rpc_argp = &args,
241 .rpc_resp = &fattr,
242 }; 247 };
243 int status; 248 int status = -ENOMEM;
244 249
245 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
246 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
247 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
248 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
249 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
250 return status; 261 return status;
251} 262}
@@ -395,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
395 .rpc_argp = &arg, 406 .rpc_argp = &arg,
396 .rpc_resp = &res, 407 .rpc_resp = &res,
397 }; 408 };
398 int status; 409 int status = -ENOMEM;
399 410
400 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
401 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
402 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
403 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
404 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
405 return status; 421 return status;
406} 422}
@@ -418,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
418 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
419 return 0; 435 return 0;
420 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
421 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
422 return 1; 438 return 1;
423} 439}
424 440
@@ -426,7 +442,6 @@ static int
426nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
427 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
428{ 444{
429 struct nfs_fattr old_dir_attr, new_dir_attr;
430 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
431 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
432 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -435,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
435 .toname = new_name->name, 450 .toname = new_name->name,
436 .tolen = new_name->len 451 .tolen = new_name->len
437 }; 452 };
438 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
439 .fromattr = &old_dir_attr,
440 .toattr = &new_dir_attr
441 };
442 struct rpc_message msg = { 454 struct rpc_message msg = {
443 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
444 .rpc_argp = &arg, 456 .rpc_argp = &arg,
445 .rpc_resp = &res, 457 .rpc_resp = &res,
446 }; 458 };
447 int status; 459 int status = -ENOMEM;
448 460
449 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
450 nfs_fattr_init(&old_dir_attr); 462
451 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
452 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
453 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
454 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
455 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
456 return status; 475 return status;
457} 476}
@@ -459,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
459static int 478static int
460nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
461{ 480{
462 struct nfs_fattr dir_attr, fattr;
463 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
464 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
465 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
466 .toname = name->name, 484 .toname = name->name,
467 .tolen = name->len 485 .tolen = name->len
468 }; 486 };
469 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
470 .dir_attr = &dir_attr,
471 .fattr = &fattr
472 };
473 struct rpc_message msg = { 488 struct rpc_message msg = {
474 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
475 .rpc_argp = &arg, 490 .rpc_argp = &arg,
476 .rpc_resp = &res, 491 .rpc_resp = &res,
477 }; 492 };
478 int status; 493 int status = -ENOMEM;
479 494
480 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
481 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
482 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
483 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
484 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
485 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
486 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
487 return status; 508 return status;
488} 509}
@@ -553,7 +574,7 @@ out:
553static int 574static int
554nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
555{ 576{
556 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
557 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
558 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
559 .name = name->name, 580 .name = name->name,
@@ -562,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
562 struct rpc_message msg = { 583 struct rpc_message msg = {
563 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
564 .rpc_argp = &arg, 585 .rpc_argp = &arg,
565 .rpc_resp = &dir_attr,
566 }; 586 };
567 int status; 587 int status = -ENOMEM;
568 588
569 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
570 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
571 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
572 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
573 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
574 return status; 600 return status;
575} 601}
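
rmdir differs slightly from the neighbouring conversions: the rpc_message initializer can no longer name the response buffer, because that buffer does not exist at initializer time. The binding therefore moves after the allocation; restating the hunk above in outline:

	struct rpc_message msg = {
		.rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
		.rpc_argp = &arg,
		/* .rpc_resp is bound below, once the buffer exists */
	};

	dir_attr = nfs_alloc_fattr();
	if (dir_attr == NULL)
		goto out;
	msg.rpc_resp = dir_attr;
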
@@ -588,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
588 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
589{ 615{
590 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
591 struct nfs_fattr dir_attr;
592 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
593 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
594 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -599,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
599 .pages = &page 624 .pages = &page
600 }; 625 };
601 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
602 .dir_attr = &dir_attr,
603 .verf = verf, 627 .verf = verf,
604 .plus = plus 628 .plus = plus
605 }; 629 };
@@ -609,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
609 .rpc_resp = &res, 633 .rpc_resp = &res,
610 .rpc_cred = cred 634 .rpc_cred = cred
611 }; 635 };
612 int status; 636 int status = -ENOMEM;
613 637
614 if (plus) 638 if (plus)
615 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -617,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
617 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
618 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
619 643
620 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
621 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
622 649
623 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
624 652
625 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
626 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
627 return status; 656 return status;
628} 657}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..75dcfc7da365 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
@@ -763,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
763static int 762static int
764nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
765{ 764{
766 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
767} 766}
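
nfs3_xdr_removeres now dereferences a caller-owned pointer, which implies struct nfs_removeres itself changed from embedding the attributes to pointing at them. An abbreviated sketch of the adjusted result structure (remaining members and exact field order elided):

	struct nfs_removeres {
		const struct nfs_server	*server;
		struct nfs_fattr	*dir_attr;	/* was: struct nfs_fattr dir_attr; */
		struct nfs4_change_info	cinfo;
	};

The XDR layer keeps decoding into the same memory; only its lifetime moved from the caller's stack frame to the heap.
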
768 767
769/* 768/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..c538c6106e16 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 206
207 207
208/* nfs4proc.c */ 208/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
288 288
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
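
These prototype changes thread a gfp_t from the point that knows the calling context down to the allocations. The distinction being plumbed through, in miniature (hypothetical helper, assuming only kmalloc() semantics):

	#include <linux/slab.h>
	#include <linux/types.h>

	/*
	 * GFP_KERNEL may trigger filesystem writeback to satisfy the request;
	 * an NFS path that is itself part of writeback or reclaim must pass
	 * GFP_NOFS so the allocator never re-enters the filesystem.
	 */
	static void *ctx_alloc(size_t len, bool reclaim_path)
	{
		return kmalloc(len, reclaim_path ? GFP_NOFS : GFP_KERNEL);
	}

Hence the new gfp_t parameters on nfs4_do_close() and nfs_alloc_seqid(): the same functions are reached both from ordinary syscalls and from memory-reclaim-driven state teardown.
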
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -114,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
114 char *page, char *page2, 115 char *page, char *page2,
115 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
116{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
117 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
118 char *mnt_path; 120 char *mnt_path;
119 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -125,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
125 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
127 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
128 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
129 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
130 struct sockaddr_storage addr;
131 136
132 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 138 continue;
@@ -136,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
136 continue; 141 continue;
137 142
138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
139 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
140 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
141 continue; 146 continue;
142 147
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
145 149
146 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -155,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
155 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
156 break; 160 break;
157 } 161 }
162 kfree(mountdata->addr);
158 return mnt; 163 return mnt;
159} 164}
160 165
@@ -220,8 +225,8 @@ out:
220 225
221/* 226/*
222 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
223 * @dentry - dentry of referral 229 * @dentry - dentry of referral
224 * @nd - nameidata info
225 * 230 *
226 */ 231 */
227struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f9254fb0c9d0..70015dd60a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -69,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
69static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
70static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
71static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
72 76
73/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
74static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -713,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
713 717
714static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 718static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
715 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 719 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
716 const struct iattr *attrs) 720 const struct iattr *attrs,
721 gfp_t gfp_mask)
717{ 722{
718 struct dentry *parent = dget_parent(path->dentry); 723 struct dentry *parent = dget_parent(path->dentry);
719 struct inode *dir = parent->d_inode; 724 struct inode *dir = parent->d_inode;
720 struct nfs_server *server = NFS_SERVER(dir); 725 struct nfs_server *server = NFS_SERVER(dir);
721 struct nfs4_opendata *p; 726 struct nfs4_opendata *p;
722 727
723 p = kzalloc(sizeof(*p), GFP_KERNEL); 728 p = kzalloc(sizeof(*p), gfp_mask);
724 if (p == NULL) 729 if (p == NULL)
725 goto err; 730 goto err;
726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 731 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
727 if (p->o_arg.seqid == NULL) 732 if (p->o_arg.seqid == NULL)
728 goto err_free; 733 goto err_free;
729 path_get(path); 734 path_get(path);
@@ -1059,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1059{ 1064{
1060 struct nfs4_opendata *opendata; 1065 struct nfs4_opendata *opendata;
1061 1066
1062 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1067 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1063 if (opendata == NULL) 1068 if (opendata == NULL)
1064 return ERR_PTR(-ENOMEM); 1069 return ERR_PTR(-ENOMEM);
1065 opendata->state = state; 1070 opendata->state = state;
@@ -1522,6 +1527,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1522 nfs_post_op_update_inode(dir, o_res->dir_attr); 1527 nfs_post_op_update_inode(dir, o_res->dir_attr);
1523 } else 1528 } else
1524 nfs_refresh_inode(dir, o_res->dir_attr); 1529 nfs_refresh_inode(dir, o_res->dir_attr);
1530 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1531 server->caps &= ~NFS_CAP_POSIX_LOCK;
1525 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1532 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1526 status = _nfs4_proc_open_confirm(data); 1533 status = _nfs4_proc_open_confirm(data);
1527 if (status != 0) 1534 if (status != 0)
@@ -1645,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1645 if (path->dentry->d_inode != NULL) 1652 if (path->dentry->d_inode != NULL)
1646 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1653 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1647 status = -ENOMEM; 1654 status = -ENOMEM;
1648 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1655 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1649 if (opendata == NULL) 1656 if (opendata == NULL)
1650 goto err_put_state_owner; 1657 goto err_put_state_owner;
1651 1658
@@ -1656,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1656 if (status != 0) 1663 if (status != 0)
1657 goto err_opendata_put; 1664 goto err_opendata_put;
1658 1665
1659 if (opendata->o_arg.open_flags & O_EXCL)
1660 nfs4_exclusive_attrset(opendata, sattr);
1661
1662 state = nfs4_opendata_to_nfs4_state(opendata); 1666 state = nfs4_opendata_to_nfs4_state(opendata);
1663 status = PTR_ERR(state); 1667 status = PTR_ERR(state);
1664 if (IS_ERR(state)) 1668 if (IS_ERR(state))
1665 goto err_opendata_put; 1669 goto err_opendata_put;
1666 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1670 if (server->caps & NFS_CAP_POSIX_LOCK)
1667 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1671 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1672
1673 if (opendata->o_arg.open_flags & O_EXCL) {
1674 nfs4_exclusive_attrset(opendata, sattr);
1675
1676 nfs_fattr_init(opendata->o_res.f_attr);
1677 status = nfs4_do_setattr(state->inode, cred,
1678 opendata->o_res.f_attr, sattr,
1679 state);
1680 if (status == 0)
1681 nfs_setattr_update_inode(state->inode, sattr);
1682 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1683 }
1668 nfs4_opendata_put(opendata); 1684 nfs4_opendata_put(opendata);
1669 nfs4_put_state_owner(sp); 1685 nfs4_put_state_owner(sp);
1670 *res = state; 1686 *res = state;
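
Two related moves land here. First, the POSIX-lock capability is latched per server when the OPEN reply is processed (the hunk in _nfs4_proc_open above) and consulted here via server->caps, instead of re-testing o_res->rflags on every open. Second, the O_EXCL attribute fix-up migrates into _nfs4_do_open from nfs4_proc_create (removed further below), so it runs after the nfs4_state exists and its SETATTR result can update the inode. The two halves, read together:

	/* in _nfs4_proc_open(), once per OPEN reply: */
	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
		server->caps &= ~NFS_CAP_POSIX_LOCK;

	/* in _nfs4_do_open(), per open: */
	if (server->caps & NFS_CAP_POSIX_LOCK)
		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);

The follow-up SETATTR is needed because an exclusive (verifier-based) NFSv4 create ignores the requested attributes; doing it here keeps that logic on the one path every O_EXCL open takes.
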
@@ -1911,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1911 * 1927 *
1912 * NOTE: Caller must be holding the sp->so_owner semaphore! 1928 * NOTE: Caller must be holding the sp->so_owner semaphore!
1913 */ 1929 */
1914int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1930int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1915{ 1931{
1916 struct nfs_server *server = NFS_SERVER(state->inode); 1932 struct nfs_server *server = NFS_SERVER(state->inode);
1917 struct nfs4_closedata *calldata; 1933 struct nfs4_closedata *calldata;
@@ -1930,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1930 }; 1946 };
1931 int status = -ENOMEM; 1947 int status = -ENOMEM;
1932 1948
1933 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1949 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1934 if (calldata == NULL) 1950 if (calldata == NULL)
1935 goto out; 1951 goto out;
1936 calldata->inode = state->inode; 1952 calldata->inode = state->inode;
@@ -1938,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1938 calldata->arg.fh = NFS_FH(state->inode); 1954 calldata->arg.fh = NFS_FH(state->inode);
1939 calldata->arg.stateid = &state->open_stateid; 1955 calldata->arg.stateid = &state->open_stateid;
1940 /* Serialization for the sequence id */ 1956 /* Serialization for the sequence id */
1941 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1957 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1942 if (calldata->arg.seqid == NULL) 1958 if (calldata->arg.seqid == NULL)
1943 goto out_free_calldata; 1959 goto out_free_calldata;
1944 calldata->arg.fmode = 0; 1960 calldata->arg.fmode = 0;
@@ -2067,8 +2083,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2067 case -EDQUOT: 2083 case -EDQUOT:
2068 case -ENOSPC: 2084 case -ENOSPC:
2069 case -EROFS: 2085 case -EROFS:
2070 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2086 return PTR_ERR(state);
2071 return 1;
2072 default: 2087 default:
2073 goto out_drop; 2088 goto out_drop;
2074 } 2089 }
@@ -2402,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
2402static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 2417static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
2403{ 2418{
2404 struct nfs_server *server = NFS_SERVER(inode); 2419 struct nfs_server *server = NFS_SERVER(inode);
2405 struct nfs_fattr fattr;
2406 struct nfs4_accessargs args = { 2420 struct nfs4_accessargs args = {
2407 .fh = NFS_FH(inode), 2421 .fh = NFS_FH(inode),
2408 .bitmask = server->attr_bitmask, 2422 .bitmask = server->attr_bitmask,
2409 }; 2423 };
2410 struct nfs4_accessres res = { 2424 struct nfs4_accessres res = {
2411 .server = server, 2425 .server = server,
2412 .fattr = &fattr,
2413 }; 2426 };
2414 struct rpc_message msg = { 2427 struct rpc_message msg = {
2415 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 2428 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2436,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2436 if (mode & MAY_EXEC) 2449 if (mode & MAY_EXEC)
2437 args.access |= NFS4_ACCESS_EXECUTE; 2450 args.access |= NFS4_ACCESS_EXECUTE;
2438 } 2451 }
2439 nfs_fattr_init(&fattr); 2452
2453 res.fattr = nfs_alloc_fattr();
2454 if (res.fattr == NULL)
2455 return -ENOMEM;
2456
2440 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2457 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2441 if (!status) { 2458 if (!status) {
2442 entry->mask = 0; 2459 entry->mask = 0;
@@ -2446,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2446 entry->mask |= MAY_WRITE; 2463 entry->mask |= MAY_WRITE;
2447 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2464 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2448 entry->mask |= MAY_EXEC; 2465 entry->mask |= MAY_EXEC;
2449 nfs_refresh_inode(inode, &fattr); 2466 nfs_refresh_inode(inode, res.fattr);
2450 } 2467 }
2468 nfs_free_fattr(res.fattr);
2451 return status; 2469 return status;
2452} 2470}
2453 2471
@@ -2560,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2560 } 2578 }
2561 d_add(dentry, igrab(state->inode)); 2579 d_add(dentry, igrab(state->inode));
2562 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2580 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2563 if (flags & O_EXCL) {
2564 struct nfs_fattr fattr;
2565 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
2566 if (status == 0)
2567 nfs_setattr_update_inode(state->inode, sattr);
2568 nfs_post_op_update_inode(state->inode, &fattr);
2569 }
2570 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2581 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
2571 status = nfs4_intent_set_file(nd, &path, state, fmode); 2582 status = nfs4_intent_set_file(nd, &path, state, fmode);
2572 else 2583 else
@@ -2594,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2594 .rpc_argp = &args, 2605 .rpc_argp = &args,
2595 .rpc_resp = &res, 2606 .rpc_resp = &res,
2596 }; 2607 };
2597 int status; 2608 int status = -ENOMEM;
2609
2610 res.dir_attr = nfs_alloc_fattr();
2611 if (res.dir_attr == NULL)
2612 goto out;
2598 2613
2599 nfs_fattr_init(&res.dir_attr);
2600 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2614 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2601 if (status == 0) { 2615 if (status == 0) {
2602 update_changeattr(dir, &res.cinfo); 2616 update_changeattr(dir, &res.cinfo);
2603 nfs_post_op_update_inode(dir, &res.dir_attr); 2617 nfs_post_op_update_inode(dir, res.dir_attr);
2604 } 2618 }
2619 nfs_free_fattr(res.dir_attr);
2620out:
2605 return status; 2621 return status;
2606} 2622}
2607 2623
@@ -2636,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2636 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2637 return 0; 2653 return 0;
2638 update_changeattr(dir, &res->cinfo); 2654 update_changeattr(dir, &res->cinfo);
2639 nfs_post_op_update_inode(dir, &res->dir_attr); 2655 nfs_post_op_update_inode(dir, res->dir_attr);
2640 return 1; 2656 return 1;
2641} 2657}
2642 2658
@@ -2651,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2651 .new_name = new_name, 2667 .new_name = new_name,
2652 .bitmask = server->attr_bitmask, 2668 .bitmask = server->attr_bitmask,
2653 }; 2669 };
2654 struct nfs_fattr old_fattr, new_fattr;
2655 struct nfs4_rename_res res = { 2670 struct nfs4_rename_res res = {
2656 .server = server, 2671 .server = server,
2657 .old_fattr = &old_fattr,
2658 .new_fattr = &new_fattr,
2659 }; 2672 };
2660 struct rpc_message msg = { 2673 struct rpc_message msg = {
2661 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], 2674 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
2662 .rpc_argp = &arg, 2675 .rpc_argp = &arg,
2663 .rpc_resp = &res, 2676 .rpc_resp = &res,
2664 }; 2677 };
2665 int status; 2678 int status = -ENOMEM;
2666 2679
2667 nfs_fattr_init(res.old_fattr); 2680 res.old_fattr = nfs_alloc_fattr();
2668 nfs_fattr_init(res.new_fattr); 2681 res.new_fattr = nfs_alloc_fattr();
2669 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2682 if (res.old_fattr == NULL || res.new_fattr == NULL)
2683 goto out;
2670 2684
2685 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2671 if (!status) { 2686 if (!status) {
2672 update_changeattr(old_dir, &res.old_cinfo); 2687 update_changeattr(old_dir, &res.old_cinfo);
2673 nfs_post_op_update_inode(old_dir, res.old_fattr); 2688 nfs_post_op_update_inode(old_dir, res.old_fattr);
2674 update_changeattr(new_dir, &res.new_cinfo); 2689 update_changeattr(new_dir, &res.new_cinfo);
2675 nfs_post_op_update_inode(new_dir, res.new_fattr); 2690 nfs_post_op_update_inode(new_dir, res.new_fattr);
2676 } 2691 }
2692out:
2693 nfs_free_fattr(res.new_fattr);
2694 nfs_free_fattr(res.old_fattr);
2677 return status; 2695 return status;
2678} 2696}
2679 2697
@@ -2700,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2700 .name = name, 2718 .name = name,
2701 .bitmask = server->attr_bitmask, 2719 .bitmask = server->attr_bitmask,
2702 }; 2720 };
2703 struct nfs_fattr fattr, dir_attr;
2704 struct nfs4_link_res res = { 2721 struct nfs4_link_res res = {
2705 .server = server, 2722 .server = server,
2706 .fattr = &fattr,
2707 .dir_attr = &dir_attr,
2708 }; 2723 };
2709 struct rpc_message msg = { 2724 struct rpc_message msg = {
2710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 2725 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
2711 .rpc_argp = &arg, 2726 .rpc_argp = &arg,
2712 .rpc_resp = &res, 2727 .rpc_resp = &res,
2713 }; 2728 };
2714 int status; 2729 int status = -ENOMEM;
2730
2731 res.fattr = nfs_alloc_fattr();
2732 res.dir_attr = nfs_alloc_fattr();
2733 if (res.fattr == NULL || res.dir_attr == NULL)
2734 goto out;
2715 2735
2716 nfs_fattr_init(res.fattr);
2717 nfs_fattr_init(res.dir_attr);
2718 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2736 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2719 if (!status) { 2737 if (!status) {
2720 update_changeattr(dir, &res.cinfo); 2738 update_changeattr(dir, &res.cinfo);
2721 nfs_post_op_update_inode(dir, res.dir_attr); 2739 nfs_post_op_update_inode(dir, res.dir_attr);
2722 nfs_post_op_update_inode(inode, res.fattr); 2740 nfs_post_op_update_inode(inode, res.fattr);
2723 } 2741 }
2724 2742out:
2743 nfs_free_fattr(res.dir_attr);
2744 nfs_free_fattr(res.fattr);
2725 return status; 2745 return status;
2726} 2746}
2727 2747
@@ -3144,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3144 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3164 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3145} 3165}
3146 3166
3167struct nfs4_renewdata {
3168 struct nfs_client *client;
3169 unsigned long timestamp;
3170};
3171
3147/* 3172/*
3148 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3173 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3149 * standalone procedure for queueing an asynchronous RENEW. 3174 * standalone procedure for queueing an asynchronous RENEW.
3150 */ 3175 */
3151static void nfs4_renew_release(void *data) 3176static void nfs4_renew_release(void *calldata)
3152{ 3177{
3153 struct nfs_client *clp = data; 3178 struct nfs4_renewdata *data = calldata;
3179 struct nfs_client *clp = data->client;
3154 3180
3155 if (atomic_read(&clp->cl_count) > 1) 3181 if (atomic_read(&clp->cl_count) > 1)
3156 nfs4_schedule_state_renewal(clp); 3182 nfs4_schedule_state_renewal(clp);
3157 nfs_put_client(clp); 3183 nfs_put_client(clp);
3184 kfree(data);
3158} 3185}
3159 3186
3160static void nfs4_renew_done(struct rpc_task *task, void *data) 3187static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3161{ 3188{
3162 struct nfs_client *clp = data; 3189 struct nfs4_renewdata *data = calldata;
3163 unsigned long timestamp = task->tk_start; 3190 struct nfs_client *clp = data->client;
3191 unsigned long timestamp = data->timestamp;
3164 3192
3165 if (task->tk_status < 0) { 3193 if (task->tk_status < 0) {
3166 /* Unless we're shutting down, schedule state recovery! */ 3194 /* Unless we're shutting down, schedule state recovery! */
@@ -3186,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3186 .rpc_argp = clp, 3214 .rpc_argp = clp,
3187 .rpc_cred = cred, 3215 .rpc_cred = cred,
3188 }; 3216 };
3217 struct nfs4_renewdata *data;
3189 3218
3190 if (!atomic_inc_not_zero(&clp->cl_count)) 3219 if (!atomic_inc_not_zero(&clp->cl_count))
3191 return -EIO; 3220 return -EIO;
3221 data = kmalloc(sizeof(*data), GFP_KERNEL);
3222 if (data == NULL)
3223 return -ENOMEM;
3224 data->client = clp;
3225 data->timestamp = jiffies;
3192 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3226 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3193 &nfs4_renew_ops, clp); 3227 &nfs4_renew_ops, data);
3194} 3228}
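
RENEW's async callback data grows from a bare nfs_client pointer to a small struct carrying the client plus a timestamp sampled at submission, so nfs4_renew_done() no longer depends on task->tk_start. The calldata contract this follows: whoever hands data to rpc_call_async() must arrange for the ->rpc_release callback to free it exactly once, on every completion path. In outline (hypothetical callback names, struct nfs4_renewdata as defined in the hunk above):

	static void demo_renew_done(struct rpc_task *task, void *calldata)
	{
		struct nfs4_renewdata *data = calldata;

		/* inspect task->tk_status and data->timestamp; never free here */
	}

	static void demo_renew_release(void *calldata)
	{
		kfree(calldata);	/* runs last, whether the task succeeded or not */
	}
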
3195 3229
3196int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3230int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3492,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3492 return _nfs4_async_handle_error(task, server, server->nfs_client, state); 3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3493} 3527}
3494 3528
3495int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res)
3496{ 3532{
3497 nfs4_verifier sc_verifier; 3533 nfs4_verifier sc_verifier;
3498 struct nfs4_setclientid setclientid = { 3534 struct nfs4_setclientid setclientid = {
@@ -3502,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3502 struct rpc_message msg = { 3538 struct rpc_message msg = {
3503 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3539 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
3504 .rpc_argp = &setclientid, 3540 .rpc_argp = &setclientid,
3505 .rpc_resp = clp, 3541 .rpc_resp = res,
3506 .rpc_cred = cred, 3542 .rpc_cred = cred,
3507 }; 3543 };
3508 __be32 *p; 3544 __be32 *p;
@@ -3545,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3545 return status; 3581 return status;
3546} 3582}
3547 3583
3548static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3584static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3585 struct nfs4_setclientid_res *arg,
3586 struct rpc_cred *cred)
3549{ 3587{
3550 struct nfs_fsinfo fsinfo; 3588 struct nfs_fsinfo fsinfo;
3551 struct rpc_message msg = { 3589 struct rpc_message msg = {
3552 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], 3590 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
3553 .rpc_argp = clp, 3591 .rpc_argp = arg,
3554 .rpc_resp = &fsinfo, 3592 .rpc_resp = &fsinfo,
3555 .rpc_cred = cred, 3593 .rpc_cred = cred,
3556 }; 3594 };
@@ -3568,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
3568 return status; 3606 return status;
3569} 3607}
3570 3608
3571int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3609int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3610 struct nfs4_setclientid_res *arg,
3611 struct rpc_cred *cred)
3572{ 3612{
3573 long timeout = 0; 3613 long timeout = 0;
3574 int err; 3614 int err;
3575 do { 3615 do {
3576 err = _nfs4_proc_setclientid_confirm(clp, cred); 3616 err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
3577 switch (err) { 3617 switch (err) {
3578 case 0: 3618 case 0:
3579 return err; 3619 return err;
@@ -3665,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3665 }; 3705 };
3666 int status = 0; 3706 int status = 0;
3667 3707
3668 data = kzalloc(sizeof(*data), GFP_KERNEL); 3708 data = kzalloc(sizeof(*data), GFP_NOFS);
3669 if (data == NULL) 3709 if (data == NULL)
3670 return -ENOMEM; 3710 return -ENOMEM;
3671 data->args.fhandle = &data->fh; 3711 data->args.fhandle = &data->fh;
@@ -3821,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3821 struct nfs4_unlockdata *p; 3861 struct nfs4_unlockdata *p;
3822 struct inode *inode = lsp->ls_state->inode; 3862 struct inode *inode = lsp->ls_state->inode;
3823 3863
3824 p = kzalloc(sizeof(*p), GFP_KERNEL); 3864 p = kzalloc(sizeof(*p), GFP_NOFS);
3825 if (p == NULL) 3865 if (p == NULL)
3826 return NULL; 3866 return NULL;
3827 p->arg.fh = NFS_FH(inode); 3867 p->arg.fh = NFS_FH(inode);
@@ -3959,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3959 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) 3999 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3960 goto out; 4000 goto out;
3961 lsp = request->fl_u.nfs4_fl.owner; 4001 lsp = request->fl_u.nfs4_fl.owner;
3962 seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4002 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
3963 status = -ENOMEM; 4003 status = -ENOMEM;
3964 if (seqid == NULL) 4004 if (seqid == NULL)
3965 goto out; 4005 goto out;
@@ -3987,22 +4027,23 @@ struct nfs4_lockdata {
3987}; 4027};
3988 4028
3989static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 4029static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3990 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) 4030 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
4031 gfp_t gfp_mask)
3991{ 4032{
3992 struct nfs4_lockdata *p; 4033 struct nfs4_lockdata *p;
3993 struct inode *inode = lsp->ls_state->inode; 4034 struct inode *inode = lsp->ls_state->inode;
3994 struct nfs_server *server = NFS_SERVER(inode); 4035 struct nfs_server *server = NFS_SERVER(inode);
3995 4036
3996 p = kzalloc(sizeof(*p), GFP_KERNEL); 4037 p = kzalloc(sizeof(*p), gfp_mask);
3997 if (p == NULL) 4038 if (p == NULL)
3998 return NULL; 4039 return NULL;
3999 4040
4000 p->arg.fh = NFS_FH(inode); 4041 p->arg.fh = NFS_FH(inode);
4001 p->arg.fl = &p->fl; 4042 p->arg.fl = &p->fl;
4002 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); 4043 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
4003 if (p->arg.open_seqid == NULL) 4044 if (p->arg.open_seqid == NULL)
4004 goto out_free; 4045 goto out_free;
4005 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4046 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
4006 if (p->arg.lock_seqid == NULL) 4047 if (p->arg.lock_seqid == NULL)
4007 goto out_free_seqid; 4048 goto out_free_seqid;
4008 p->arg.lock_stateid = &lsp->ls_stateid; 4049 p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4156,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4156 4197
4157 dprintk("%s: begin!\n", __func__); 4198 dprintk("%s: begin!\n", __func__);
4158 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), 4199 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
4159 fl->fl_u.nfs4_fl.owner); 4200 fl->fl_u.nfs4_fl.owner,
4201 recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
4160 if (data == NULL) 4202 if (data == NULL)
4161 return -ENOMEM; 4203 return -ENOMEM;
4162 if (IS_SETLKW(cmd)) 4204 if (IS_SETLKW(cmd))
@@ -4645,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4645 if (max_reqs != tbl->max_slots) { 4687 if (max_reqs != tbl->max_slots) {
4646 ret = -ENOMEM; 4688 ret = -ENOMEM;
4647 new = kmalloc(max_reqs * sizeof(struct nfs4_slot), 4689 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4648 GFP_KERNEL); 4690 GFP_NOFS);
4649 if (!new) 4691 if (!new)
4650 goto out; 4692 goto out;
4651 ret = 0; 4693 ret = 0;
@@ -4710,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4710 4752
4711 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); 4753 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4712 4754
4713 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4755 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
4714 if (!slot) 4756 if (!slot)
4715 goto out; 4757 goto out;
4716 ret = 0; 4758 ret = 0;
@@ -4759,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4759 struct nfs4_session *session; 4801 struct nfs4_session *session;
4760 struct nfs4_slot_table *tbl; 4802 struct nfs4_slot_table *tbl;
4761 4803
4762 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); 4804 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
4763 if (!session) 4805 if (!session)
4764 return NULL; 4806 return NULL;
4765 4807
@@ -5103,8 +5145,8 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5103 5145
5104 if (!atomic_inc_not_zero(&clp->cl_count)) 5146 if (!atomic_inc_not_zero(&clp->cl_count))
5105 return -EIO; 5147 return -EIO;
5106 args = kzalloc(sizeof(*args), GFP_KERNEL); 5148 args = kzalloc(sizeof(*args), GFP_NOFS);
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5149 res = kzalloc(sizeof(*res), GFP_NOFS);
5108 if (!args || !res) { 5150 if (!args || !res) {
5109 kfree(args); 5151 kfree(args);
5110 kfree(res); 5152 kfree(res);
@@ -5205,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5205 int status = -ENOMEM; 5247 int status = -ENOMEM;
5206 5248
5207 dprintk("--> %s\n", __func__); 5249 dprintk("--> %s\n", __func__);
5208 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 5250 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5209 if (calldata == NULL) 5251 if (calldata == NULL)
5210 goto out; 5252 goto out;
5211 calldata->clp = clp; 5253 calldata->clp = clp;
@@ -5216,9 +5258,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5216 msg.rpc_resp = &calldata->res; 5258 msg.rpc_resp = &calldata->res;
5217 task_setup_data.callback_data = calldata; 5259 task_setup_data.callback_data = calldata;
5218 task = rpc_run_task(&task_setup_data); 5260 task = rpc_run_task(&task_setup_data);
5219 if (IS_ERR(task)) 5261 if (IS_ERR(task)) {
5220 status = PTR_ERR(task); 5262 status = PTR_ERR(task);
5263 goto out;
5264 }
5221 rpc_put_task(task); 5265 rpc_put_task(task);
5266 return 0;
5222out: 5267out:
5223 dprintk("<-- %s status=%d\n", __func__, status); 5268 dprintk("<-- %s status=%d\n", __func__, status);
5224 return status; 5269 return status;
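
The reclaim-complete error path previously fell through to rpc_put_task() even when rpc_run_task() returned an ERR_PTR-encoded error, handing a non-pointer to a function that dereferences it. The corrected shape, applicable to any rpc_run_task() caller:

	task = rpc_run_task(&task_setup_data);
	if (IS_ERR(task)) {
		status = PTR_ERR(task);
		goto out;		/* never rpc_put_task() an ERR_PTR */
	}
	rpc_put_task(task);
	return 0;

The explicit return 0 also stops the stale status = -ENOMEM initializer from being reported after a successful submission.
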
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..34acf5926fdc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 struct nfs4_setclientid_res clid;
65 unsigned short port; 66 unsigned short port;
66 int status; 67 int status;
67 68
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
69 if (clp->cl_addr.ss_family == AF_INET6) 70 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6; 71 port = nfs_callback_tcpport6;
71 72
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); 73 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
73 if (status == 0) 74 if (status != 0)
74 status = nfs4_proc_setclientid_confirm(clp, cred); 75 goto out;
75 if (status == 0) 76 status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
76 nfs4_schedule_state_renewal(clp); 77 if (status != 0)
78 goto out;
79 clp->cl_clientid = clid.clientid;
80 nfs4_schedule_state_renewal(clp);
81out:
77 return status; 82 return status;
78} 83}
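
SETCLIENTID results are now decoded into a caller-provided nfs4_setclientid_res rather than written straight into struct nfs_client, and cl_clientid is published only once SETCLIENTID_CONFIRM succeeds. The control flow above makes the staging visible; condensed:

	struct nfs4_setclientid_res clid;

	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
	if (status != 0)
		goto out;
	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;	/* commit only once confirmed */

A failed confirm consequently leaves the nfs_client untouched instead of half-updated.
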
79 84
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
361{ 366{
362 struct nfs4_state_owner *sp; 367 struct nfs4_state_owner *sp;
363 368
364 sp = kzalloc(sizeof(*sp),GFP_KERNEL); 369 sp = kzalloc(sizeof(*sp),GFP_NOFS);
365 if (!sp) 370 if (!sp)
366 return NULL; 371 return NULL;
367 spin_lock_init(&sp->so_lock); 372 spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
435{ 440{
436 struct nfs4_state *state; 441 struct nfs4_state *state;
437 442
438 state = kzalloc(sizeof(*state), GFP_KERNEL); 443 state = kzalloc(sizeof(*state), GFP_NOFS);
439 if (!state) 444 if (!state)
440 return NULL; 445 return NULL;
441 atomic_set(&state->count, 1); 446 atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
537/* 542/*
538 * Close the current file. 543 * Close the current file.
539 */ 544 */
540static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) 545static void __nfs4_close(struct path *path, struct nfs4_state *state,
546 fmode_t fmode, gfp_t gfp_mask, int wait)
541{ 547{
542 struct nfs4_state_owner *owner = state->owner; 548 struct nfs4_state_owner *owner = state->owner;
543 int call_close = 0; 549 int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
578 nfs4_put_open_state(state); 584 nfs4_put_open_state(state);
579 nfs4_put_state_owner(owner); 585 nfs4_put_state_owner(owner);
580 } else 586 } else
581 nfs4_do_close(path, state, wait); 587 nfs4_do_close(path, state, gfp_mask, wait);
582} 588}
583 589
584void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
585{ 591{
586 __nfs4_close(path, state, fmode, 0); 592 __nfs4_close(path, state, fmode, GFP_NOFS, 0);
587} 593}
588 594
589void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) 595void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
590{ 596{
591 __nfs4_close(path, state, fmode, 1); 597 __nfs4_close(path, state, fmode, GFP_KERNEL, 1);
592} 598}
593 599
594/* 600/*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
618 struct nfs4_lock_state *lsp; 624 struct nfs4_lock_state *lsp;
619 struct nfs_client *clp = state->owner->so_client; 625 struct nfs_client *clp = state->owner->so_client;
620 626
621 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 627 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
622 if (lsp == NULL) 628 if (lsp == NULL)
623 return NULL; 629 return NULL;
624 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 630 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
754 nfs4_put_lock_state(lsp); 760 nfs4_put_lock_state(lsp);
755} 761}
756 762
757struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 763struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
758{ 764{
759 struct nfs_seqid *new; 765 struct nfs_seqid *new;
760 766
761 new = kmalloc(sizeof(*new), GFP_KERNEL); 767 new = kmalloc(sizeof(*new), gfp_mask);
762 if (new != NULL) { 768 if (new != NULL) {
763 new->sequence = counter; 769 new->sequence = counter;
764 INIT_LIST_HEAD(&new->list); 770 INIT_LIST_HEAD(&new->list);
@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
1347 1353
1348 nfs4_begin_drain_session(clp); 1354 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), 1355 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL); 1356 GFP_NOFS);
1351 if (!new) 1357 if (!new)
1352 return -ENOMEM; 1358 return -ENOMEM;
1353 1359
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..6bdef28efa33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -1505,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1505 hdr->replen += decode_setclientid_maxsz; 1504 hdr->replen += decode_setclientid_maxsz;
1506} 1505}
1507 1506
1508static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1507static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1509{ 1508{
1510 __be32 *p; 1509 __be32 *p;
1511 1510
1512 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1511 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1513 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1512 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1514 p = xdr_encode_hyper(p, client_state->cl_clientid); 1513 p = xdr_encode_hyper(p, arg->clientid);
1515 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1514 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1516 hdr->nops++; 1515 hdr->nops++;
1517 hdr->replen += decode_setclientid_confirm_maxsz; 1516 hdr->replen += decode_setclientid_confirm_maxsz;
1518} 1517}
@@ -2325,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2325/* 2324/*
2326 * a SETCLIENTID_CONFIRM request 2325 * a SETCLIENTID_CONFIRM request
2327 */ 2326 */
2328static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2327static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
2329{ 2328{
2330 struct xdr_stream xdr; 2329 struct xdr_stream xdr;
2331 struct compound_hdr hdr = { 2330 struct compound_hdr hdr = {
@@ -2335,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2335 2334
2336 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2335 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2337 encode_compound_hdr(&xdr, req, &hdr); 2336 encode_compound_hdr(&xdr, req, &hdr);
2338 encode_setclientid_confirm(&xdr, clp, &hdr); 2337 encode_setclientid_confirm(&xdr, arg, &hdr);
2339 encode_putrootfh(&xdr, &hdr); 2338 encode_putrootfh(&xdr, &hdr);
2340 encode_fsinfo(&xdr, lease_bitmap, &hdr); 2339 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2341 encode_nops(&hdr); 2340 encode_nops(&hdr);
@@ -4398,7 +4397,7 @@ out_overflow:
4398 return -EIO; 4397 return -EIO;
4399} 4398}
4400 4399
4401static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4400static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
4402{ 4401{
4403 __be32 *p; 4402 __be32 *p;
4404 uint32_t opnum; 4403 uint32_t opnum;
@@ -4418,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4418 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); 4417 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4419 if (unlikely(!p)) 4418 if (unlikely(!p))
4420 goto out_overflow; 4419 goto out_overflow;
4421 p = xdr_decode_hyper(p, &clp->cl_clientid); 4420 p = xdr_decode_hyper(p, &res->clientid);
4422 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); 4421 memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
4423 } else if (nfserr == NFSERR_CLID_INUSE) { 4422 } else if (nfserr == NFSERR_CLID_INUSE) {
4424 uint32_t len; 4423 uint32_t len;
4425 4424
@@ -4816,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4816 goto out; 4815 goto out;
4817 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4816 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4818 goto out; 4817 goto out;
4819 decode_getfattr(&xdr, &res->dir_attr, res->server, 4818 decode_getfattr(&xdr, res->dir_attr, res->server,
4820 !RPC_IS_ASYNC(rqstp->rq_task)); 4819 !RPC_IS_ASYNC(rqstp->rq_task));
4821out: 4820out:
4822 return status; 4821 return status;
@@ -5499,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5499 * Decode SETCLIENTID response 5498 * Decode SETCLIENTID response
5500 */ 5499 */
5501static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5502 struct nfs_client *clp) 5501 struct nfs4_setclientid_res *res)
5503{ 5502{
5504 struct xdr_stream xdr; 5503 struct xdr_stream xdr;
5505 struct compound_hdr hdr; 5504 struct compound_hdr hdr;
@@ -5508,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5508 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5507 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
5509 status = decode_compound_hdr(&xdr, &hdr); 5508 status = decode_compound_hdr(&xdr, &hdr);
5510 if (!status) 5509 if (!status)
5511 status = decode_setclientid(&xdr, clp); 5510 status = decode_setclientid(&xdr, res);
5512 return status; 5511 return status;
5513} 5512}
5514 5513
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5552 if (status != 0) 5551 if (status != 0)
5553 goto out; 5552 goto out;
5554 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5555 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5556 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5557out: 5558out:
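
Before this fix, a failing DELEGRETURN decode fell through into decode_getfattr() with the XDR stream still positioned on the error body, risking garbage attributes. Compound decoding must short-circuit at the first failing mandatory op; the idiom, in the style of this file (surrounding ops abbreviated):

	status = decode_putfh(&xdr);
	if (status != 0)
		goto out;
	status = decode_delegreturn(&xdr);
	if (status != 0)
		goto out;
	decode_getfattr(&xdr, res->fattr, res->server,
			!RPC_IS_ASYNC(rqstp->rq_task));
	out:
		return status;

GETATTR's result is advisory here, which is why its status can still be ignored once the mandatory ops have succeeded.
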
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..6bd19d843af7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
488 */ 488 */
489static int __init root_nfs_get_handle(void) 489static int __init root_nfs_get_handle(void)
490{ 490{
491 struct nfs_fh fh;
492 struct sockaddr_in sin; 491 struct sockaddr_in sin;
493 unsigned int auth_flav_len = 0; 492 unsigned int auth_flav_len = 0;
494 struct nfs_mount_request request = { 493 struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
499 NFS_MNT3_VERSION : NFS_MNT_VERSION, 498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
500 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
501 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, 500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
502 .fh = &fh,
503 .auth_flav_len = &auth_flav_len, 501 .auth_flav_len = &auth_flav_len,
504 }; 502 };
505 int status; 503 int status = -ENOMEM;
506 504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
507 set_sockaddr(&sin, servaddr, htons(mount_port)); 508 set_sockaddr(&sin, servaddr, htons(mount_port));
508 status = nfs_mount(&request); 509 status = nfs_mount(&request);
509 if (status < 0) 510 if (status < 0)
510 printk(KERN_ERR "Root-NFS: Server returned error %d " 511 printk(KERN_ERR "Root-NFS: Server returned error %d "
511 "while mounting %s\n", status, nfs_export_path); 512 "while mounting %s\n", status, nfs_export_path);
512 else { 513 else {
513 nfs_data.root.size = fh.size; 514 nfs_data.root.size = request.fh->size;
514 memcpy(nfs_data.root.data, fh.data, fh.size); 515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
515 } 516 }
516 517 nfs_free_fhandle(request.fh);
518out:
517 return status; 519 return status;
518} 520}
519 521
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29d9d36cd5f4..a3654e57b589 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
60{ 60{
61 struct nfs_page *req; 61 struct nfs_page *req;
62 62
63 for (;;) { 63 /* try to allocate the request struct */
64 /* try to allocate the request struct */ 64 req = nfs_page_alloc();
65 req = nfs_page_alloc(); 65 if (req == NULL)
66 if (req != NULL) 66 return ERR_PTR(-ENOMEM);
67 break;
68
69 if (fatal_signal_pending(current))
70 return ERR_PTR(-ERESTARTSYS);
71 yield();
72 }
73 67
74 /* Initialize the request struct. Initially, we assume a 68 /* Initialize the request struct. Initially, we assume a
75 * long write-back delay. This will be adjusted in 69 * long write-back delay. This will be adjusted in
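
The unbounded retry loop around nfs_page_alloc(), spinning on yield() until memory appeared or a fatal signal arrived, is replaced by an immediate ERR_PTR(-ENOMEM). Busy-wait allocation loops invert priorities and can stall the very reclaim they are waiting on; propagating the failure lets callers and the VM schedule the retry instead. A caller now handles it like any allocation (signature as used in this file):

	struct nfs_page *req;

	req = nfs_create_request(ctx, inode, page, offset, count);
	if (IS_ERR(req))
		return PTR_ERR(req);	/* typically -ENOMEM; writeback is retried later */
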
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
@@ -225,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
225 return status; 224 return status;
226} 225}
227 226
227struct nfs_createdata {
228 struct nfs_createargs arg;
229 struct nfs_diropok res;
230 struct nfs_fh fhandle;
231 struct nfs_fattr fattr;
232};
233
234static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
235 struct dentry *dentry, struct iattr *sattr)
236{
237 struct nfs_createdata *data;
238
239 data = kmalloc(sizeof(*data), GFP_KERNEL);
240
241 if (data != NULL) {
242 data->arg.fh = NFS_FH(dir);
243 data->arg.name = dentry->d_name.name;
244 data->arg.len = dentry->d_name.len;
245 data->arg.sattr = sattr;
246 nfs_fattr_init(&data->fattr);
247 data->fhandle.size = 0;
248 data->res.fh = &data->fhandle;
249 data->res.fattr = &data->fattr;
250 }
251 return data;
252};
253
254static void nfs_free_createdata(const struct nfs_createdata *data)
255{
256 kfree(data);
257}
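
nfs_alloc_createdata() packs the argument and result structures together with the file handle and attribute buffers they reference into a single allocation, so the self-referential pointers (data->res.fh = &data->fhandle) stay valid exactly as long as the rest of the bundle and one kfree() tears everything down. Callers bind it to the message only after the NULL check; condensed from the hunks below:

	data = nfs_alloc_createdata(dir, dentry, sattr);
	if (data == NULL)
		goto out;			/* status is already -ENOMEM */
	msg.rpc_argp = &data->arg;
	msg.rpc_resp = &data->res;
	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
	/* ... consume data->res.fh / data->res.fattr ... */
	nfs_free_createdata(data);
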
258
228static int 259static int
229nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
230 int flags, struct nameidata *nd) 261 int flags, struct nameidata *nd)
231{ 262{
232 struct nfs_fh fhandle; 263 struct nfs_createdata *data;
233 struct nfs_fattr fattr;
234 struct nfs_createargs arg = {
235 .fh = NFS_FH(dir),
236 .name = dentry->d_name.name,
237 .len = dentry->d_name.len,
238 .sattr = sattr
239 };
240 struct nfs_diropok res = {
241 .fh = &fhandle,
242 .fattr = &fattr
243 };
244 struct rpc_message msg = { 264 struct rpc_message msg = {
245 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 265 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
246 .rpc_argp = &arg,
247 .rpc_resp = &res,
248 }; 266 };
249 int status; 267 int status = -ENOMEM;
250 268
251 nfs_fattr_init(&fattr);
252 dprintk("NFS call create %s\n", dentry->d_name.name); 269 dprintk("NFS call create %s\n", dentry->d_name.name);
270 data = nfs_alloc_createdata(dir, dentry, sattr);
271 if (data == NULL)
272 goto out;
273 msg.rpc_argp = &data->arg;
274 msg.rpc_resp = &data->res;
253 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 275 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
254 nfs_mark_for_revalidate(dir); 276 nfs_mark_for_revalidate(dir);
255 if (status == 0) 277 if (status == 0)
256 status = nfs_instantiate(dentry, &fhandle, &fattr); 278 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
279 nfs_free_createdata(data);
280out:
257 dprintk("NFS reply create: %d\n", status); 281 dprintk("NFS reply create: %d\n", status);
258 return status; 282 return status;
259} 283}
@@ -265,24 +289,12 @@ static int
 nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	       dev_t rdev)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status, mode;
+	umode_t mode;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mknod %s\n", dentry->d_name.name);
 
@@ -295,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
 	}
 
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
-		nfs_fattr_init(&fattr);
+		nfs_fattr_init(data->res.fattr);
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
@@ -399,8 +418,8 @@ static int
 nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		 unsigned int len, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs arg = {
 		.fromfh = NFS_FH(dir),
 		.fromname = dentry->d_name.name,
@@ -413,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		.rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
 		.rpc_argp = &arg,
 	};
-	int status;
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call symlink %s\n", dentry->d_name.name);
 
 	if (len > NFS2_MAXPATHLEN)
-		return -ENAMETOOLONG;
+		goto out;
 
-	dprintk("NFS call symlink %s\n", dentry->d_name.name);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
@@ -428,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * filehandle size to zero indicates to nfs_instantiate that it
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
-	if (status == 0) {
-		nfs_fattr_init(&fattr);
-		fhandle.size = 0;
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
-	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr);
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
@@ -441,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 static int
 nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
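
The fs/nfs/proc.c hunks above all apply one refactoring: the nfs_createargs/nfs_diropok pairs that create, mknod and mkdir used to keep on the kernel stack now live in a single kmalloc()ed struct nfs_createdata, with a goto-style unwind when the allocation fails. A minimal user-space sketch of that allocate/use/free shape (the struct layout and names below are stand-ins, not the kernel's):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Stand-in for the kernel's createargs + diropok bundle. */
	struct createdata {
		char arg[128];		/* request arguments */
		char res[128];		/* reply buffer */
	};

	/* Allocate and initialize in one step, as nfs_alloc_createdata() does. */
	static struct createdata *alloc_createdata(const char *name)
	{
		struct createdata *data = malloc(sizeof(*data));

		if (data != NULL) {
			snprintf(data->arg, sizeof(data->arg), "create %s", name);
			data->res[0] = '\0';
		}
		return data;
	}

	static int do_create(const char *name)
	{
		struct createdata *data;
		int status = -ENOMEM;	/* default: allocation failure */

		data = alloc_createdata(name);
		if (data == NULL)
			goto out;
		status = 0;		/* stands in for rpc_call_sync() */
		free(data);		/* one free on every path past here */
	out:
		return status;
	}

	int main(void)
	{
		printf("do_create: %d\n", do_create("file"));
		return 0;
	}

The payoff is stack footprint: these RPCs can be issued from deep call chains, so a couple of hundred bytes of on-stack argument structs per operation is worth trading for one heap allocation.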
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..6e2b06e6ca79 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6baf9a393466..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
@@ -140,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
-	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%s" },
@@ -170,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -422,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned char blockbits;
 	unsigned long blockres;
 	struct nfs_fh *fh = NFS_FH(dentry->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-		.fattr = &fattr,
-	};
-	int error;
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
+
 	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
@@ -1045,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
-		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (!string)
-				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
-			mnt->options |= NFS_OPTION_FSCACHE;
-			break;
 
 		/*
 		 * options that take numeric values
@@ -1063,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1184,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
@@ -1383,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			};
 			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * Special options
@@ -2171,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 
 	data = nfs_alloc_parsed_mount_data(3);
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2186,6 +2191,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (data->version == 4) {
 		error = nfs4_try_mount(flags, dev_name, data, mnt);
 		kfree(data->client_address);
+		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif /* CONFIG_NFS_V4 */
@@ -2245,7 +2251,7 @@ out:
 	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	kfree(data);
 	return error;
 
@@ -2554,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2612,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	return error;
 
 out_free:
@@ -2656,7 +2662,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
 	devname = nfs_path(path->mnt->mnt_devname,
 			path->mnt->mnt_root, path->dentry,
 			page, PAGE_SIZE);
-	if (devname == NULL)
+	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
 	if (tmp == NULL)
@@ -2667,41 +2673,120 @@ out_freepage:
 	free_page((unsigned long)page);
 }
 
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
 static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path, struct vfsmount *mnt_target)
 {
+	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
-	struct nameidata nd;
 	struct super_block *s;
 	int ret;
 
+	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	if (nd == NULL)
+		return -ENOMEM;
+
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
 		goto out_mntput;
 
+	ret = nfs_referral_loop_protect();
+	if (ret != 0)
+		goto out_put_mnt_ns;
+
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &nd);
+			export_path, LOOKUP_FOLLOW, nd);
 
+	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
 
 	if (ret != 0)
 		goto out_err;
 
-	s = nd.path.mnt->mnt_sb;
+	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
 	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd.path.dentry);
+	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd.path, mnt_target);
+	nfs_fix_devname(&nd->path, mnt_target);
 
-	path_put(&nd.path);
+	path_put(&nd->path);
+	kfree(nd);
 	down_write(&s->s_umount);
 	return 0;
+out_put_mnt_ns:
+	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
+	kfree(nd);
 	return ret;
 }
 
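
The nfs_referral_count machinery added above caps nested NFSv4 referral traversals at NFS_MAX_NESTED_REFERRALS (2) per task, so a server returning cyclic referrals cannot recurse the client into the ground; note also that nfs_follow_remote_path() now kmalloc()s its struct nameidata rather than keeping that large structure on the stack. One detail worth calling out in the protect path: the list entry is allocated before the spinlock is taken (kmalloc may sleep; a spinlock holder must not), and the preallocated entry is simply freed if the current task already has one on the list. A rough user-space analog of the protect side, with a pthread mutex standing in for the spinlock (names are ours, not the kernel's):

	#include <errno.h>
	#include <pthread.h>
	#include <stdlib.h>

	#define MAX_NESTED 2

	struct refcount_entry {
		struct refcount_entry *next;
		pthread_t task;
		unsigned int count;
	};

	static struct refcount_entry *head;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static struct refcount_entry *find_entry(void)
	{
		struct refcount_entry *p;

		for (p = head; p != NULL; p = p->next)
			if (pthread_equal(p->task, pthread_self()))
				return p;
		return NULL;
	}

	int loop_protect(void)
	{
		/* Allocate before taking the lock, as the kernel code must. */
		struct refcount_entry *p, *new = malloc(sizeof(*new));
		int ret = -ENOMEM;

		if (new == NULL)
			return ret;
		new->task = pthread_self();
		new->count = 1;

		ret = 0;
		pthread_mutex_lock(&lock);
		p = find_entry();
		if (p != NULL) {
			if (p->count >= MAX_NESTED)
				ret = -ELOOP;	/* nested too deep: refuse */
			else
				p->count++;
		} else {
			new->next = head;
			head = new;
			new = NULL;	/* ownership moved to the list */
		}
		pthread_mutex_unlock(&lock);
		free(new);		/* no-op if the list took it */
		return ret;
	}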
@@ -2872,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
-	struct nfs_fh mntfh;
+	struct nfs_fh *mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error;
+	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	mntfh = nfs_alloc_fhandle();
+	if (mntfh == NULL)
+		goto out_err_nofh;
+
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -2914,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, &mntfh);
+	mntroot = nfs4_get_root(s, mntfh);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2931,12 +3020,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return 0;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
+	nfs_free_fhandle(mntfh);
+out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
 	return error;
 
@@ -2945,6 +3037,7 @@ error_splat_super:
 	bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
 }
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..a2242af6a17d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
 	struct nfs_removeres res;
 	struct inode *dir;
 	struct rpc_cred	*cred;
+	struct nfs_fattr dir_attr;
 };
 
 /**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	}
 	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(&data->res.dir_attr);
+	nfs_fattr_init(data->res.dir_attr);
 
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		goto out_free;
 	}
 	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
+	page_cache_get(page);
 	if (atomic_long_inc_return(&nfss->writeback) >
 			NFS_CONGESTION_ON_THRESH) {
 		set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
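
These two hunks pair a page_cache_get() in nfs_set_page_writeback() with a page_cache_release() in nfs_end_page_writeback(), pinning the page for the whole writeback window; the later hunks in this patch can then safely call nfs_end_page_writeback() after the request lock has been dropped. The general acquire-before-start / release-after-completion shape, as a self-contained sketch (C11 atomics; nothing here is NFS API):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct page_like {
		atomic_int refcount;
	};

	static void get(struct page_like *p) { atomic_fetch_add(&p->refcount, 1); }

	static void put(struct page_like *p)
	{
		if (atomic_fetch_sub(&p->refcount, 1) == 1)
			free(p);	/* dropped the last reference */
	}

	static void start_writeback(struct page_like *p)
	{
		get(p);			/* pin for the duration of the async I/O */
		/* ... kick off I/O ... */
	}

	static void end_writeback(struct page_like *p)
	{
		/* ... mark the I/O complete first, then unpin ... */
		put(p);
	}

	int main(void)
	{
		struct page_like *p = malloc(sizeof(*p));

		atomic_init(&p->refcount, 1);	/* caller's reference */
		start_writeback(p);
		end_writeback(p);
		put(p);				/* drop the caller's reference */
		return 0;
	}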
@@ -421,6 +423,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
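
The net effect of the hunk above: the write RPC is now always launched with RPC_TASK_ASYNC, and a FLUSH_SYNC caller waits for the task explicitly and harvests task->tk_status, rather than asking the RPC layer to run it synchronously. The same launch-then-optionally-join control flow in plain pthreads (illustrative only; compile with -pthread):

	#include <pthread.h>
	#include <stdio.h>

	#define FLUSH_SYNC 1

	static void *rpc_body(void *arg)
	{
		/* ... the actual write RPC would run here ... */
		return NULL;
	}

	/* Always start the work asynchronously; block only if asked to. */
	static int run_task(int how)
	{
		pthread_t task;
		int ret;

		ret = pthread_create(&task, NULL, rpc_body, NULL);
		if (ret != 0)
			return -ret;		/* could not even launch */
		if (how & FLUSH_SYNC)
			ret = -pthread_join(task, NULL);	/* wait for completion */
		else
			pthread_detach(task);	/* fire and forget */
		return ret;
	}

	int main(void)
	{
		printf("sync run: %d\n", run_task(FLUSH_SYNC));
		return 0;
	}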
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
 remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
@@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT, nfs_wait_bit_killable,
+				TASK_KILLABLE))
+		return 1;
+	return 0;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+
 static void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
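
nfs_commit_set_lock()/nfs_commit_clear_lock() build a tiny mutex out of one inode flag bit: test_and_set_bit() is the trylock, out_of_line_wait_on_bit_lock() is the killable blocking slow path, and the unlock side orders clear_bit() before wake_up_bit() via smp_mb__after_clear_bit() so a sleeper cannot miss the wakeup. The same try-or-wait contract rendered with a condition variable (user-space sketch, our names; the pthread mutex supplies the ordering the kernel gets from the barrier):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	static bool commit_in_progress;

	/* Returns true if we own the "commit" flag on return. */
	static bool commit_set_lock(bool may_wait)
	{
		bool got;

		pthread_mutex_lock(&m);
		if (!commit_in_progress) {
			commit_in_progress = true;	/* trylock succeeded */
			got = true;
		} else if (may_wait) {
			while (commit_in_progress)
				pthread_cond_wait(&cv, &m);	/* slow path: sleep */
			commit_in_progress = true;
			got = true;
		} else {
			got = false;	/* someone else is committing; don't wait */
		}
		pthread_mutex_unlock(&m);
		return got;
	}

	static void commit_clear_lock(void)
	{
		pthread_mutex_lock(&m);
		commit_in_progress = false;
		/* The mutex gives us the clear-before-wake ordering the
		 * kernel gets from smp_mb__after_clear_bit(). */
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
	}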
@@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1207{ 1237{
1208 struct nfs_page *first = nfs_list_entry(head->next); 1238 struct nfs_page *first = nfs_list_entry(head->next);
1209 struct inode *inode = first->wb_context->path.dentry->d_inode; 1239 struct inode *inode = first->wb_context->path.dentry->d_inode;
1210 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1211 int priority = flush_task_priority(how); 1240 int priority = flush_task_priority(how);
1212 struct rpc_task *task; 1241 struct rpc_task *task;
1213 struct rpc_message msg = { 1242 struct rpc_message msg = {
@@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
 
@@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 static int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
-	int res;
+	int may_wait = how & FLUSH_SYNC;
+	int res = 0;
 
+	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1360,7 +1394,22 @@ static int nfs_commit_inode(struct inode *inode, int how)
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-	}
+		if (may_wait)
+			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+					nfs_wait_bit_killable,
+					TASK_KILLABLE);
+		else
+			goto out_mark_dirty;
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+	return res;
+	/* Note: If we exit without ensuring that the commit is complete,
+	 * we must mark the inode as dirty. Otherwise, future calls to
+	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+	 * that the data is on the disk.
+	 */
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return res;
 }
 
@@ -1432,6 +1481,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
+		wait_on_page_writeback(page);
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
@@ -1466,30 +1516,21 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 		.range_start = range_start,
 		.range_end = range_end,
 	};
-	struct nfs_page *req;
-	int need_commit;
 	int ret;
 
-	while(PagePrivate(page)) {
+	for (;;) {
+		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
+			continue;
 		}
-		req = nfs_find_and_lock_request(page);
-		if (!req)
+		if (!PagePrivate(page))
 			break;
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
-			goto out_error;
-		}
-		need_commit = test_bit(PG_CLEAN, &req->wb_flags);
-		nfs_clear_page_tag_locked(req);
-		if (need_commit) {
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-			if (ret < 0)
-				goto out_error;
-		}
+		ret = nfs_commit_inode(inode, FLUSH_SYNC);
+		if (ret < 0)
+			goto out_error;
 	}
 	return 0;
 out_error:
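
The rewritten nfs_wb_page() loop is: wait out any writeback in flight, write the page back if it is dirty and re-check from the top, and once it is clean either stop (no nfs_page left attached, i.e. !PagePrivate) or commit the unstable data and go around again. A toy model of that state machine (purely illustrative; the three states stand in for dirty / written-but-uncommitted / fully clean):

	#include <stdio.h>

	enum page_state { DIRTY, UNSTABLE, CLEAN };

	/* Toy model of the rewritten nfs_wb_page() loop: flush until CLEAN. */
	static int wb_page(enum page_state st)
	{
		int passes = 0;

		for (;;) {
			passes++;
			if (st == DIRTY) {	/* clear_page_dirty_for_io() */
				st = UNSTABLE;	/* a write leaves data uncommitted */
				continue;	/* state changed: re-check */
			}
			if (st == CLEAN)	/* !PagePrivate(page) */
				break;
			st = CLEAN;		/* nfs_commit_inode(FLUSH_SYNC) */
		}
		return passes;
	}

	int main(void)
	{
		printf("dirty page needed %d passes\n", wb_page(DIRTY));
		return 0;
	}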
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
@@ -258,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
 	.alloc		= expkey_alloc,
 };
 
-static struct svc_expkey *
-svc_expkey_lookup(struct svc_expkey *item)
+static int
+svc_expkey_hash(struct svc_expkey *item)
 {
-	struct cache_head *ch;
 	int hash = item->ek_fsidtype;
 	char * cp = (char*)item->ek_fsid;
 	int len = key_len(item->ek_fsidtype);
@@ -269,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
 	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
 	hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
 	hash &= EXPKEY_HASHMASK;
+	return hash;
+}
+
+static struct svc_expkey *
+svc_expkey_lookup(struct svc_expkey *item)
+{
+	struct cache_head *ch;
+	int hash = svc_expkey_hash(item);
 
 	ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
 				 hash);
@@ -282,13 +290,7 @@ static struct svc_expkey *
 svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 {
 	struct cache_head *ch;
-	int hash = new->ek_fsidtype;
-	char * cp = (char*)new->ek_fsid;
-	int len = key_len(new->ek_fsidtype);
-
-	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
-	hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
-	hash &= EXPKEY_HASHMASK;
+	int hash = svc_expkey_hash(new);
 
 	ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
 				 &old->h, hash);
@@ -737,14 +739,22 @@ struct cache_detail svc_export_cache = {
 	.alloc		= svc_export_alloc,
 };
 
-static struct svc_export *
-svc_export_lookup(struct svc_export *exp)
+static int
+svc_export_hash(struct svc_export *exp)
 {
-	struct cache_head *ch;
 	int hash;
+
 	hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+	return hash;
+}
+
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+	struct cache_head *ch;
+	int hash = svc_export_hash(exp);
 
 	ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
 				 hash);
@@ -758,10 +768,7 @@ static struct svc_export *
 svc_export_update(struct svc_export *new, struct svc_export *old)
 {
 	struct cache_head *ch;
-	int hash;
-	hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
+	int hash = svc_export_hash(old);
 
 	ch = sunrpc_cache_update(&svc_export_cache, &new->h,
 				 &old->h,
@@ -1070,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
 	err = 0;
 finish:
 	kfree(new.ex_pathname);
-	if (exp)
+	if (!IS_ERR_OR_NULL(exp))
 		exp_put(exp);
-	if (fsid_key && !IS_ERR(fsid_key))
+	if (!IS_ERR_OR_NULL(fsid_key))
 		cache_put(&fsid_key->h, &svc_expkey_cache);
 	path_put(&path);
 out_put_clp:
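
Both sunrpc caches in this file get the same cleanup: the bucket-hash computation that was copy-pasted between the *_lookup() and *_update() paths is hoisted into svc_expkey_hash()/svc_export_hash(), so the two paths can never disagree about which hash chain an entry lives on. In miniature (ordinary C, toy hash function):

	#include <stdio.h>

	struct key { unsigned a, b; };

	/* One shared hash helper instead of two copy-pasted computations. */
	static unsigned key_hash(const struct key *k)
	{
		return (k->a * 2654435761u) ^ (k->b * 40503u);	/* toy mix */
	}

	static unsigned lookup_bucket(const struct key *k) { return key_hash(k) & 255; }
	static unsigned update_bucket(const struct key *k) { return key_hash(k) & 255; }

	int main(void)
	{
		struct key k = { 1, 2 };

		/* lookup and update can no longer disagree on the bucket: */
		printf("%u %u\n", lookup_bucket(&k), update_bucket(&k));
		return 0;
	}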
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs4_acl.h>
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..eb78e7e22077 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,8 @@
  */
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
 
@@ -78,11 +80,6 @@ enum nfs_cb_opnum4 {
 	cb_sequence_dec_sz +                    \
 	op_dec_sz)
 
-struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
-};
-
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
@@ -427,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
 };
 
 static struct rpc_version nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use. The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
 	.number			= 1,
 	.nrprocs		= ARRAY_SIZE(nfs4_cb_procedures),
 	.procs			= nfs4_cb_procedures
 };
 
 static struct rpc_version * nfs_cb_version[] = {
-	NULL,
 	&nfs_cb_version4,
 };
 
@@ -455,15 +458,14 @@ static struct rpc_program cb_program = {
 
 static int max_cb_time(void)
 {
-	return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
+	return max(nfsd4_lease/10, (time_t)1) * HZ;
 }
 
 /* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cb_set an atomic? */
+ * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_timeout timeparms = {
 		.to_initval = max_cb_time(),
 		.to_retries = 0,
@@ -475,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
 		.timeout = &timeparms,
 		.program = &cb_program,
 		.prognumber = cb->cb_prog,
-		.version = nfs_cb_version[1]->number,
+		.version = 0,
 		.authflavor = clp->cl_flavor,
 		.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name = clp->cl_principal,
@@ -485,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
 	if (cb->cb_minorversion) {
-		args.bc_xprt = clp->cl_cb_xprt;
+		args.bc_xprt = cb->cb_xprt;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
@@ -495,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	cb->cb_client = client;
+	nfsd4_set_callback_client(clp, client);
 	return 0;
 
 }
@@ -513,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
 	else
-		atomic_set(&clp->cl_cb_conn.cb_set, 1);
-	put_nfs4_client(clp);
+		atomic_set(&clp->cl_cb_set, 1);
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -536,7 +537,6 @@ int set_callback_cred(void)
 
 void do_probe_callback(struct nfs4_client *clp)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
@@ -544,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
 	};
 	int status;
 
-	status = rpc_call_async(cb->cb_client, &msg,
+	status = rpc_call_async(clp->cl_cb_client, &msg,
 				RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
 				&nfsd4_cb_probe_ops, (void *)clp);
-	if (status) {
+	if (status)
 		warn_no_callback_path(clp, status);
-		put_nfs4_client(clp);
-	}
 }
 
 /*
  * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
  */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
+void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
 	int status;
 
-	BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
+	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp);
+	status = setup_callback_client(clp, cb);
 	if (status) {
 		warn_no_callback_path(clp, status);
 		return;
 	}
-
-	/* the task holds a reference to the nfs4_client struct */
-	atomic_inc(&clp->cl_count);
-
 	do_probe_callback(clp);
 }
 
@@ -657,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 	}
 }
 
+
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
 	struct nfs4_client *clp = dp->dl_client;
+	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
 
+	if (current_rpc_client == NULL) {
+		/* We're shutting down; give up. */
+		/* XXX: err, or is it ok just to fall through
+		 * and rpc_restart_call? */
+		return;
+	}
+
 	switch (task->tk_status) {
 	case -EIO:
 		/* Network partition? */
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
+		if (current_rpc_client != task->tk_client) {
+			/* queue a callback on the new connection: */
+			nfsd4_cb_recall(dp);
+			return;
+		}
 	case -EBADHANDLE:
 	case -NFS4ERR_BAD_STATEID:
 		/* Race: client probably got cb_recall
@@ -676,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		break;
 	default:
 		/* success, or error we can't handle */
-		goto done;
+		return;
 	}
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
@@ -684,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call(task);
 		return;
 	} else {
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
 	}
-done:
-	kfree(task->tk_msg.rpc_argp);
 }
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
-	struct nfs4_client *clp = dp->dl_client;
 
 	nfs4_put_delegation(dp);
-	put_nfs4_client(clp);
 }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -706,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
 	.rpc_release = nfsd4_cb_recall_release,
 };
 
+static struct workqueue_struct *callback_wq;
+
+int nfsd4_create_callback_queue(void)
+{
+	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+	if (!callback_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+	destroy_workqueue(callback_wq);
+}
+
+/* must be called under the state lock */
+void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+{
+	struct rpc_clnt *old = clp->cl_cb_client;
+
+	clp->cl_cb_client = new;
+	/*
+	 * After this, any work that saw the old value of cl_cb_client will
+	 * be gone:
+	 */
+	flush_workqueue(callback_wq);
+	/* So we can safely shut it down: */
+	if (old)
+		rpc_shutdown_client(old);
+}
+
 /*
  * called with dp->dl_count inc'ed.
  */
-void
-nfsd4_cb_recall(struct nfs4_delegation *dp)
+static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-	struct nfs4_rpc_args *args;
+	struct rpc_clnt *clnt = clp->cl_cb_client;
+	struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_cred = callback_cred
 	};
-	int status = -ENOMEM;
+	int status;
+
+	if (clnt == NULL)
+		return; /* Client is shutting down; give up. */
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		goto out;
 	args->args_op = dp;
 	msg.rpc_argp = args;
 	dp->dl_retries = 1;
 	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
 				&nfsd4_cb_recall_ops, dp);
-out:
-	if (status) {
-		kfree(args);
-		put_nfs4_client(clp);
+	if (status)
 		nfs4_put_delegation(dp);
-	}
+}
+
+void nfsd4_do_callback_rpc(struct work_struct *w)
+{
+	/* XXX: for now, just send off delegation recall. */
+	/* In future, generalize to handle any sort of callback. */
+	struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+	struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
+
+	_nfsd4_cb_recall(dp);
+}
+
+
+void nfsd4_cb_recall(struct nfs4_delegation *dp)
+{
+	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
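
The single-threaded nfsd4_callbacks workqueue replaces the old per-callback client refcounting (the deleted put_nfs4_client() calls). All recalls now run from one worker, so nfsd4_set_callback_client() can retire an rpc_clnt with a publish/flush/free sequence: store the new pointer, flush_workqueue() to drain any queued work that may still have seen the old pointer, then rpc_shutdown_client() the old one. A self-contained miniature of that sequence (one worker thread and a counter standing in for the workqueue; our own names; compile with -pthread):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	static int pending, stop;
	static char *client;		/* stands in for clp->cl_cb_client */

	static void *worker(void *unused)
	{
		pthread_mutex_lock(&m);
		while (!stop) {
			if (pending == 0) {
				pthread_cond_wait(&cv, &m);
				continue;
			}
			if (client)	/* work sees 'client' as of when it runs */
				printf("recall via %s\n", client);
			pending--;
			pthread_cond_broadcast(&cv);	/* wake any flusher */
		}
		pthread_mutex_unlock(&m);
		return NULL;
	}

	static void queue_one(void)
	{
		pthread_mutex_lock(&m);
		pending++;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
	}

	static void flush_queue(void)	/* flush_workqueue() analog */
	{
		pthread_mutex_lock(&m);
		while (pending > 0)
			pthread_cond_wait(&cv, &m);
		pthread_mutex_unlock(&m);
	}

	static void set_client(char *new)
	{
		char *old;

		pthread_mutex_lock(&m);
		old = client;
		client = new;		/* 1. publish the new pointer */
		pthread_mutex_unlock(&m);
		flush_queue();		/* 2. drain work that saw the old one */
		free(old);		/* 3. only now is it safe to destroy */
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, worker, NULL);
		set_client(strdup("conn-A"));
		queue_one();
		set_client(strdup("conn-B"));
		pthread_mutex_lock(&m);
		stop = 1;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
		pthread_join(t, NULL);
		free(client);
		return 0;
	}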
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <linux/file.h>
+#include <linux/slab.h>
 
 #include "cache.h"
 #include "xdr4.h"
@@ -968,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
- * Enforce NFSv4.1 COMPOUND ordering rules.
+ * Enforce NFSv4.1 COMPOUND ordering rules:
  *
- * TODO:
- * - enforce NFS4ERR_NOT_ONLY_OP,
- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ * Also note, enforced elsewhere:
+ * - SEQUENCE other than as first op results in
+ *   NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ * - BIND_CONN_TO_SESSION must be the only op in its compound
+ *   (Will be enforced in nfsd4_bind_conn_to_session().)
+ * - DESTROY_SESSION must be the final operation in a compound, if
+ *   sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ *   (Enforced in nfsd4_destroy_session().)
  */
-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
 {
-	if (args->minorversion && args->opcnt > 0) {
-		struct nfsd4_op *op = &args->ops[0];
-		return (op->status == nfserr_op_illegal) ||
-		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
-	}
-	return true;
+	struct nfsd4_op *op = &args->ops[0];
+
+	/* These ordering requirements don't apply to NFSv4.0: */
+	if (args->minorversion == 0)
+		return nfs_ok;
+	/* This is weird, but OK, not our problem: */
+	if (args->opcnt == 0)
+		return nfs_ok;
+	if (op->status == nfserr_op_illegal)
+		return nfs_ok;
+	if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+		return nfserr_op_not_in_session;
+	if (op->opnum == OP_SEQUENCE)
+		return nfs_ok;
+	if (args->opcnt != 1)
+		return nfserr_not_only_op;
+	return nfs_ok;
 }
 
 /*
@@ -1011,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->rqstp = rqstp;
 	resp->cstate.minorversion = args->minorversion;
 	resp->cstate.replay_owner = NULL;
+	resp->cstate.session = NULL;
 	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
 	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
 	/* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1023,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	if (!nfs41_op_ordering_ok(args)) {
+	status = nfs41_check_op_ordering(args);
+	if (status) {
 		op = &args->ops[0];
-		op->status = nfserr_sequence_pos;
+		op->status = status;
 		goto encode_op;
 	}
 
-	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
@@ -1294,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1295 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1296 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1297}; 1320};
1298 1321
1299static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
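
The rewritten check above reduces to a short decision ladder over the first operation of a compound, returning a distinct NFSv4.1 error for each way the ordering rules can be violated. A standalone sketch of the same ladder, in plain C with simplified stand-ins for the opcode table, the ALLOWED_AS_FIRST_OP flag, and the nfserr_* codes (not the kernel code itself):

    /* Sketch of the NFSv4.1 first-op check; the opcode values, flag test
     * and error codes here are simplified stand-ins for nfsd's tables. */
    #include <stdio.h>

    enum { OP_PUTROOTFH = 24, OP_EXCHANGE_ID = 42, OP_SEQUENCE = 53 };
    enum { NFS_OK, ERR_OP_NOT_IN_SESSION, ERR_NOT_ONLY_OP };

    static int allowed_as_first_op(int opnum)
    {
            /* In nfsd this is the ALLOWED_AS_FIRST_OP flag in nfsd4_ops[]. */
            return opnum == OP_SEQUENCE || opnum == OP_EXCHANGE_ID;
    }

    static int check_op_ordering(int minorversion, int opcnt, int first_op)
    {
            if (minorversion == 0)          /* rules only apply to v4.1 */
                    return NFS_OK;
            if (opcnt == 0)                 /* empty compound: not our problem */
                    return NFS_OK;
            if (!allowed_as_first_op(first_op))
                    return ERR_OP_NOT_IN_SESSION;
            if (first_op == OP_SEQUENCE)    /* SEQUENCE may lead any compound */
                    return NFS_OK;
            if (opcnt != 1)                 /* e.g. EXCHANGE_ID must stand alone */
                    return ERR_NOT_ONLY_OP;
            return NFS_OK;
    }

    int main(void)
    {
            printf("%d\n", check_op_ordering(1, 3, OP_PUTROOTFH));   /* 1 */
            printf("%d\n", check_op_ordering(1, 3, OP_EXCHANGE_ID)); /* 2 */
            printf("%d\n", check_op_ordering(1, 3, OP_SEQUENCE));    /* 0 */
            return 0;
    }
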
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
@@ -43,8 +44,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -116,33 +116,28 @@ out_no_tfm:
 	return status;
 }
 
-static void
-nfsd4_sync_rec_dir(void)
-{
-	vfs_fsync(NULL, rec_dir.dentry, 0);
-}
-
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
 
-	if (!rec_dir_init || clp->cl_firststate)
+	if (!rec_file || clp->cl_firststate)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
+	dir = rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_lock(&dir->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -152,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.mnt);
+	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_file->f_path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		nfsd4_sync_rec_dir();
+		vfs_fsync(rec_file, 0);
 	}
 	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -205,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	struct dentry *dentry;
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+	filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
@@ -249,13 +244,14 @@ out:
 static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	dir = rec_file->f_path.dentry;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -263,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+	status = vfs_rmdir(dir->d_inode, dentry);
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -277,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	const struct cred *original_cred;
 	int status;
 
-	if (!rec_dir_init || !clp->cl_firststate)
+	if (!rec_file || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -292,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_creds(original_cred);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -322,19 +318,19 @@ void
 nfsd4_recdir_purge_old(void) {
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 }
 
 static int
@@ -354,10 +350,13 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (!rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -374,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
-	BUG_ON(rec_dir_init);
+	BUG_ON(rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -384,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			&rec_dir);
-	if (status)
+	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
+		rec_file = NULL;
+	}
 
-	if (!status)
-		rec_dir_init = 1;
 	nfs4_reset_creds(original_cred);
 }
 
 void
 nfsd4_shutdown_recdir(void)
 {
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	rec_dir_init = 0;
-	path_put(&rec_dir);
+	fput(rec_file);
+	rec_file = NULL;
}
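
Replacing the struct path / rec_dir_init pair with a single struct file * lets one object serve as both the handle that pins the recovery directory and the NULL/non-NULL "initialized" test, and it hands vfs_fsync() the file it now takes directly. A loose userspace analogue of that idiom (the path here is an example only):

    /* Userspace sketch: one open directory fd doubles as the fsync()
     * handle and the "is it initialized?" flag, mirroring how rec_file
     * replaces the old rec_dir + rec_dir_init pair. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rec_fd = -1;     /* -1 plays the role of rec_file == NULL */

    static void init_recdir(const char *dirname)
    {
            rec_fd = open(dirname, O_RDONLY | O_DIRECTORY);
            if (rec_fd < 0)
                    perror("unable to find recovery directory");
    }

    static void sync_recdir(void)
    {
            if (rec_fd < 0)     /* not initialized: nothing to do */
                    return;
            fsync(rec_fd);      /* flush directory metadata to disk */
    }

    static void shutdown_recdir(void)
    {
            if (rec_fd < 0)
                    return;
            close(rec_fd);
            rec_fd = -1;
    }

    int main(void)
    {
            init_recdir("/tmp");        /* example path only */
            sync_recdir();
            shutdown_recdir();
            return 0;
    }
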
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..12f7109720c2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
 
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/sunrpc/svcauth_gss.h>
@@ -44,8 +45,8 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static time_t lease_time = 90;     /* default lease time */
-static time_t user_lease_time = 90;
+time_t nfsd4_lease = 90;     /* default lease time */
+time_t nfsd4_grace = 90;
 static time_t boot_time;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
@@ -189,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
 	dp->dl_ident = cb->cb_ident;
-	dp->dl_stateid.si_boot = get_seconds();
+	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
@@ -198,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &clp->cl_delegations);
+	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
 	return dp;
 }
 
@@ -248,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
  * SETCLIENTID state
  */
 
+/* client_lock protects the client lru list and session hash table */
+static DEFINE_SPINLOCK(client_lock);
+
 /* Hash tables for nfs4_clientid state */
 #define CLIENT_HASH_BITS                 4
 #define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
@@ -366,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
 	nfs4_put_stateowner(sop);
 }
 
-static DEFINE_SPINLOCK(sessionid_lock);
 #define SESSION_HASH_SIZE	512
 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 
@@ -564,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
 	status = nfs_ok;
 out:
@@ -578,7 +582,7 @@ out_free:
 	goto out;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static struct nfsd4_session *
 find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 {
@@ -601,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 	return NULL;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
@@ -609,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-static void
-release_session(struct nfsd4_session *ses)
-{
-	spin_lock(&sessionid_lock);
-	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
-	nfsd4_put_session(ses);
-}
-
 void
 free_session(struct kref *kref)
 {
@@ -633,9 +628,18 @@ free_session(struct kref *kref)
 	kfree(ses);
 }
 
+/* must be called under the client_lock */
 static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
 {
+	if (is_client_expired(clp)) {
+		dprintk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
 	/*
 	 * Move client to the end to the LRU list.
 	 */
@@ -646,6 +650,14 @@ renew_client(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	spin_lock(&client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid)
@@ -679,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	return clp;
 }
 
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
-	if (clnt) {
-		/*
-		 * Callback threads take a reference on the client, so there
-		 * should be no outstanding callbacks at this point.
-		 */
-		clp->cl_cb_conn.cb_client = NULL;
-		rpc_shutdown_client(clnt);
-	}
-}
-
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	shutdown_callback_client(clp);
-	if (clp->cl_cb_xprt)
-		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -708,10 +702,34 @@ free_client(struct nfs4_client *clp)
 }
 
 void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
 {
-	if (atomic_dec_and_test(&clp->cl_count))
+	struct nfs4_client *clp = session->se_client;
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+		return;
+	if (is_client_expired(clp)) {
 		free_client(clp);
+		session->se_client = NULL;
+	} else
+		renew_client_locked(clp);
+	spin_unlock(&client_lock);
+	nfsd4_put_session(session);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	mark_client_expired(clp);
+	list_del(&clp->cl_lru);
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		unhash_session(ses);
+		nfsd4_put_session(ses);
+	}
 }
 
 static void
@@ -721,9 +739,6 @@ expire_client(struct nfs4_client *clp)
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
-	dprintk("NFSD: expire_client cl_count %d\n",
-			atomic_read(&clp->cl_count));
-
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
@@ -739,20 +754,20 @@ expire_client(struct nfs4_client *clp)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				se_perclnt);
-		release_session(ses);
-	}
-	put_nfs4_client(clp);
+	nfsd4_set_callback_client(clp, NULL);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	list_del(&clp->cl_idhash);
+	list_del(&clp->cl_strhash);
+	spin_lock(&client_lock);
+	unhash_client_locked(clp);
+	if (atomic_read(&clp->cl_refcount) == 0)
+		free_client(clp);
+	spin_unlock(&client_lock);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -838,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	}
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
-	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_cb_conn.cb_set, 0);
+	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
@@ -876,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
-	list_add_tail(&clp->cl_lru, &client_lru);
-	clp->cl_time = get_seconds();
+	renew_client(clp);
 }
 
 static void
@@ -887,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int strhashval;
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_del_init(&clp->cl_strhash);
 	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
 
@@ -1326,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
-		/*
-		 * We do not support RDMA or persistent sessions
-		 */
-		cr_ses->flags &= ~SESSION4_PERSIST;
-		cr_ses->flags &= ~SESSION4_RDMA;
-
 		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(rqstp->rq_xprt);
 			rpc_copy_addr(
 				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
 				sa);
@@ -1343,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
 			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf);
+			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
 		}
 		conf = unconf;
 	} else {
@@ -1351,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
 	status = alloc_init_session(rqstp, conf, cr_ses);
 	if (status)
 		goto out;
@@ -1368,6 +1382,21 @@ out:
 	return status;
 }
 
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
 __be32
 nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_compound_state *cstate,
@@ -1383,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	 * - Do we need to clear any callback info from previous session?
 	 */
 
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			return nfserr_not_only_op;
+	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
 	if (!ses) {
-		spin_unlock(&sessionid_lock);
+		spin_unlock(&client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
+	nfs4_lock_state();
 	/* wait for callbacks */
-	shutdown_callback_client(ses->se_client);
+	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1416,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
 	if (!session)
@@ -1455,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->slot = slot;
 	cstate->session = session;
 
-	/* Hold a session reference until done processing the compound:
-	 * nfsd4_put_session called only if the cstate slot is set.
-	 */
-	nfsd4_get_session(session);
 out:
-	spin_unlock(&sessionid_lock);
-	/* Renew the clientid on success and on replay */
+	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
-		nfs4_lock_state();
-		renew_client(session->se_client);
-		nfs4_unlock_state();
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&session->se_client->cl_refcount);
 	}
+	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
 __be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		return nfs_ok;
+	}
+	nfs4_lock_state();
+	if (is_client_expired(cstate->session->se_client)) {
+		nfs4_unlock_state();
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		return nfserr_stale_clientid;
+	}
+	nfsd4_create_clid_dir(cstate->session->se_client);
+	nfs4_unlock_state();
+	return nfs_ok;
+}
+
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
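
From this point on, a v4.1 compound pins its client with cl_refcount (taken in nfsd4_sequence(), dropped in release_session_client()), and the final put must decide between freeing an expired client and renewing a live one while holding client_lock. A rough pthreads sketch of that drop-side decision, assuming a plain mutex where the kernel uses atomic_dec_and_lock():

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct client {
            int refcount;   /* last put decides free-vs-renew under the lock */
            bool expired;
    };

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    /* In the kernel, atomic_dec_and_lock() only takes the lock on the
     * 1->0 transition; locking unconditionally keeps the sketch short
     * without changing the invariant being illustrated. */
    static void put_client(struct client *clp)
    {
            pthread_mutex_lock(&client_lock);
            if (--clp->refcount > 0) {
                    pthread_mutex_unlock(&client_lock);
                    return;
            }
            if (clp->expired)
                    free(clp);  /* expiry already unhashed it; we free */
            /* else: renew, i.e. move the client to the LRU tail (omitted) */
            pthread_mutex_unlock(&client_lock);
    }

    int main(void)
    {
            struct client *clp = calloc(1, sizeof(*clp));

            clp->refcount = 1;      /* taken when the compound started */
            clp->expired = true;    /* laundromat expired it mid-compound */
            put_client(clp);        /* last reference: freed here, not earlier */
            return 0;
    }
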
@@ -1630,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			/* XXX: We just turn off callbacks until we can handle
-			 * change request correctly. */
-			atomic_set(&conf->cl_cb_conn.cb_set, 0);
+			atomic_set(&conf->cl_cb_set, 0);
+			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1666,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf);
+			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1699,12 +1757,12 @@ alloc_init_file(struct inode *ino)
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		spin_lock(&recall_lock);
+		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		return fp;
 	}
 	return NULL;
@@ -1826,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -2027,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	 * lock) we know the server hasn't removed the lease yet, we know
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
-	atomic_inc(&dp->dl_client->cl_count);
 
 	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2346,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2354,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			if (!atomic_read(&cb->cb_set))
+			if (!cb_up)
 				open->op_recall = 1;
 			flag = open->op_delegate_type;
 			if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2365,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+			if (!cb_up || !sop->so_confirmed)
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2482,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
-	if (nfsd4_has_session(&resp->cstate)) {
+	if (nfsd4_has_session(&resp->cstate))
 		open->op_stateowner->so_confirmed = 1;
-		nfsd4_create_clid_dir(open->op_stateowner->so_client);
-	}
 
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
@@ -2536,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_conn.cb_set))
+			&& !atomic_read(&clp->cl_cb_set))
 		goto out;
 	status = nfs_ok;
 out:
@@ -2553,6 +2608,12 @@ nfsd4_end_grace(void)
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
 	locks_end_grace(&nfsd4_manager);
+	/*
+	 * Now that every NFSv4 client has had the chance to recover and
+	 * to see the (possibly new, possibly shorter) lease time, we
+	 * can safely set the next grace time to the current lease time:
+	 */
+	nfsd4_grace = nfsd4_lease;
 }
 
 static time_t
@@ -2562,15 +2623,17 @@ nfs4_laundromat(void)
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
-	time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
-	time_t t, clientid_val = NFSD_LEASE_TIME;
-	time_t u, test_val = NFSD_LEASE_TIME;
+	time_t cutoff = get_seconds() - nfsd4_lease;
+	time_t t, clientid_val = nfsd4_lease;
+	time_t u, test_val = nfsd4_lease;
 
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
 	if (locks_in_grace())
 		nfsd4_end_grace();
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&client_lock);
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2579,12 +2642,22 @@ nfs4_laundromat(void)
 				clientid_val = t;
 			break;
 		}
+		if (atomic_read(&clp->cl_refcount)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		unhash_client_locked(clp);
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
 		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
-	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
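
The laundromat now reaps clients in two phases: it unhashes candidates onto a private reaplist while holding client_lock (skipping any client with a live refcount), then runs the heavyweight expire_client() teardown after dropping the lock. A minimal userspace sketch of that collect-then-free pattern, with a simplified singly linked list in place of the kernel's list_head machinery:

    #include <pthread.h>
    #include <stdlib.h>

    struct client {
            struct client *next;
            int refcount;
            long time;              /* last renewal, in seconds */
    };

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct client *client_lru;

    static void expire_client(struct client *clp)   /* may sleep: run unlocked */
    {
            free(clp);
    }

    static void laundromat(long cutoff)
    {
            struct client **pp, *clp, *reaplist = NULL;

            pthread_mutex_lock(&client_lock);
            for (pp = &client_lru; (clp = *pp) != NULL; ) {
                    if (clp->time > cutoff || clp->refcount) {
                            pp = &clp->next;    /* recently renewed or in use */
                            continue;
                    }
                    *pp = clp->next;            /* unhash under the lock */
                    clp->next = reaplist;       /* park on the private list */
                    reaplist = clp;
            }
            pthread_mutex_unlock(&client_lock);

            while ((clp = reaplist) != NULL) {  /* teardown without the lock */
                    reaplist = clp->next;
                    expire_client(clp);
            }
    }

    int main(void)
    {
            struct client *c = calloc(1, sizeof(*c));

            c->time = 0;            /* long idle */
            c->next = client_lru;
            client_lru = c;
            laundromat(100);        /* reaps everything idle past the cutoff */
            return 0;
    }
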
@@ -2604,7 +2677,7 @@ nfs4_laundromat(void)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	test_val = NFSD_LEASE_TIME;
+	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
 		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
 		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2660,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (time_after((unsigned long)boot_time,
-			(unsigned long)stateid->si_boot)) {
-		dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
-	if (time_before((unsigned long)boot_time,
-			((unsigned long)stateid->si_boot)) &&
-	    time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-		dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-	if (EXPIRED_STATEID(stateid))
-		return nfserr_expired;
-
-	dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
-		STATEID_VAL(stateid));
-	return nfserr_bad_stateid;
+	if (stateid->si_boot == boot_time)
+		return 0;
+	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
+		STATEID_VAL(stateid));
+	return 1;
 }
 
 static inline int
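
With si_boot now recording the server's boot time rather than the stateid's creation time, staleness collapses to an equality test, and the old EXPIRED_STATEID/stateid_error_map distinction disappears; lookup failures simply return nfserr_bad_stateid. A tiny illustration of the simplified rule:

    /* Minimal illustration: a stateid is stale iff it was minted by a
     * different server instance, identified by its boot time. */
    #include <assert.h>
    #include <time.h>

    struct stateid { time_t si_boot; };

    static time_t boot_time;

    static int stale_stateid(const struct stateid *sid)
    {
            return sid->si_boot != boot_time;
    }

    int main(void)
    {
            boot_time = time(NULL);
            struct stateid cur = { boot_time };
            struct stateid old = { boot_time - 100 };   /* pre-reboot stateid */

            assert(!stale_stateid(&cur));
            assert(stale_stateid(&old));
            return 0;
    }
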
@@ -2816,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
-		if (!dp) {
-			status = stateid_error_map(stateid);
+		if (!dp)
 			goto out;
-		}
 		status = check_stateid_generation(stateid, &dp->dl_stateid,
 						  flags);
 		if (status)
@@ -2832,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
-		if (!stp) {
-			status = stateid_error_map(stateid);
+		if (!stp)
 			goto out;
-		}
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2907,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
 		if (sop == NULL)
-			return stateid_error_map(stateid);
+			return nfserr_bad_stateid;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3174,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!is_delegation_stateid(stateid))
 		goto out;
 	dp = find_delegation_stateid(inode, stateid);
-	if (!dp) {
-		status = stateid_error_map(stateid);
+	if (!dp)
 		goto out;
-	}
 	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
 	if (status)
 		goto out;
@@ -3403,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -3975,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
-unsigned long
-get_nfs4_grace_period(void)
-{
-	return max(user_lease_time, lease_time) * HZ;
-}
-
 /*
  * Since the lifetime of a delegation isn't limited to that of an open, a
  * client may quite reasonably hang on to a delegation as long as it has
@@ -4007,20 +4040,27 @@ set_max_delegations(void)
 static int
 __nfs4_state_start(void)
 {
-	unsigned long grace_time;
+	int ret;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs4_grace_period();
-	lease_time = user_lease_time;
 	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       grace_time/HZ);
+	       nfsd4_grace);
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL)
 		return -ENOMEM;
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
 	set_max_delegations();
-	return set_callback_cred();
+	return 0;
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+	return ret;
 }
 
 int
@@ -4038,12 +4078,6 @@ nfs4_state_start(void)
 	return 0;
 }
 
-time_t
-nfs4_lease_time(void)
-{
-	return lease_time;
-}
-
 static void
 __nfs4_state_shutdown(void)
 {
@@ -4088,6 +4122,7 @@ nfs4_state_shutdown(void)
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
+	nfsd4_destroy_callback_queue();
 	nfs4_unlock_state();
 }
 
@@ -4127,21 +4162,3 @@ nfs4_recoverydir(void)
 {
 	return user_recovery_dirname;
 }
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot.  Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time.  If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
-	user_lease_time = leasetime;
-}
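
__nfs4_state_start() above adopts the usual goto-unwind shape: once the laundry workqueue exists, a later failure (here, creating the callback queue) must destroy it before returning. A minimal sketch of that shape, with placeholder resources standing in for the kernel objects:

    /* Each failure point releases exactly what was set up before it;
     * the names here are placeholders, not the kernel API. */
    #include <stdio.h>
    #include <stdlib.h>

    static void *create_workqueue(void) { return malloc(1); }
    static void destroy_workqueue(void *wq) { free(wq); }
    static int create_callback_queue(void) { return 0; }   /* 0 on success */

    static int state_start(void)
    {
            void *laundry_wq;
            int ret;

            laundry_wq = create_workqueue();
            if (laundry_wq == NULL)
                    return -1;                  /* nothing to unwind yet */
            ret = create_callback_queue();
            if (ret)
                    goto out_free_laundry;      /* undo the workqueue only */
            return 0;

    out_free_laundry:
            destroy_workqueue(laundry_wq);
            return ret;
    }

    int main(void)
    {
            printf("state_start: %d\n", state_start());
            return 0;
    }
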
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c47b4d7bafa7..ac17a7080239 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
  * at the end of nfs4svc_decode_compoundargs.
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	argp->p = page_address(argp->pagelist[0]);
 	argp->pagelist++;
 	if (argp->pagelen < PAGE_SIZE) {
-		argp->end = p + (argp->pagelen>>2);
+		argp->end = argp->p + (argp->pagelen>>2);
 		argp->pagelen = 0;
 	} else {
-		argp->end = p + (PAGE_SIZE>>2);
+		argp->end = argp->p + (PAGE_SIZE>>2);
 		argp->pagelen -= PAGE_SIZE;
 	}
 	memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1233,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(rc->rca_one_fs);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
@@ -1345,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
 struct nfsd4_minorversion_ops {
@@ -1425,10 +1436,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 			argp->p = page_address(argp->pagelist[0]);
 			argp->pagelist++;
 			if (argp->pagelen < PAGE_SIZE) {
-				argp->end = p + (argp->pagelen>>2);
+				argp->end = argp->p + (argp->pagelen>>2);
 				argp->pagelen = 0;
 			} else {
-				argp->end = p + (PAGE_SIZE>>2);
+				argp->end = argp->p + (PAGE_SIZE>>2);
 				argp->pagelen -= PAGE_SIZE;
 			}
 		}
@@ -1899,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(NFSD_LEASE_TIME);
+		WRITE32(nfsd4_lease);
 	}
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
@@ -3306,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
-		nfsd4_store_cache_entry(resp);
-		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-		resp->cstate.slot->sl_inuse = false;
-		nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			cs->slot->sl_inuse = false;
+		}
+		/* Renew the clientid on success and on replay */
+		release_session_client(cs->session);
 	}
 	return 1;
 }
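
The two nfs4xdr.c pointer fixes above are the same bug: after stepping argp->p to a fresh page, the old code computed argp->end from a stale local p that still pointed into the previous page, so later bounds checks compared pointers from different buffers. A reduced version of the pattern:

    /* Reduced version of the read_buf() bug: after switching pages, the
     * end pointer must be derived from the updated cursor, not a stale
     * local that still references the old page. */
    #include <assert.h>
    #include <stdint.h>

    struct args {
            uint32_t *p;    /* decode cursor */
            uint32_t *end;  /* end of the current page */
    };

    static void advance_page(struct args *argp, uint32_t *newpage, long words)
    {
            uint32_t *p = argp->p;          /* stale once we switch pages */

            argp->p = newpage;
            /* Buggy: argp->end = p + words;  (end lands in the old page) */
            argp->end = argp->p + words;    /* fixed: use the new cursor */
            assert(argp->end - argp->p == words);
            (void)p;
    }

    int main(void)
    {
            uint32_t oldpage[1024], newpage[1024];
            struct args a = { oldpage + 100, oldpage + 1024 };

            advance_page(&a, newpage, 1024);
            assert(a.p == newpage && a.end == newpage + 1024);
            return 0;
    }
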
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
@@ -45,6 +46,7 @@ enum {
45 */ 46 */
46#ifdef CONFIG_NFSD_V4 47#ifdef CONFIG_NFSD_V4
47 NFSD_Leasetime, 48 NFSD_Leasetime,
49 NFSD_Gracetime,
48 NFSD_RecoveryDir, 50 NFSD_RecoveryDir,
49#endif 51#endif
50}; 52};
@@ -69,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
69static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 71static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
70#ifdef CONFIG_NFSD_V4 72#ifdef CONFIG_NFSD_V4
71static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 73static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
74static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
72static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 75static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
73#endif 76#endif
74 77
@@ -90,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
90 [NFSD_MaxBlkSize] = write_maxblksize, 93 [NFSD_MaxBlkSize] = write_maxblksize,
91#ifdef CONFIG_NFSD_V4 94#ifdef CONFIG_NFSD_V4
92 [NFSD_Leasetime] = write_leasetime, 95 [NFSD_Leasetime] = write_leasetime,
96 [NFSD_Gracetime] = write_gracetime,
93 [NFSD_RecoveryDir] = write_recoverydir, 97 [NFSD_RecoveryDir] = write_recoverydir,
94#endif 98#endif
95}; 99};
@@ -994,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
994 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
995 return -EINVAL; 999 return -EINVAL;
996 1000
997 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
998 return -EINVAL; 1002 return -EINVAL;
999 1003
1000 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1036,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1036 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1037 return -EINVAL; 1041 return -EINVAL;
1038 1042
1039 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1040 return -EINVAL; 1044 return -EINVAL;
1041 1045
1042 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
@@ -1203,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1203} 1207}
1204 1208
1205#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
1206extern time_t nfs4_leasetime(void); 1210static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1207
1208static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1209{ 1211{
1210 /* if size > 10 seconds, call
1211 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1212 */
1213 char *mesg = buf; 1212 char *mesg = buf;
1214 int rv, lease; 1213 int rv, i;
1215 1214
1216 if (size > 0) { 1215 if (size > 0) {
1217 if (nfsd_serv) 1216 if (nfsd_serv)
1218 return -EBUSY; 1217 return -EBUSY;
1219 rv = get_int(&mesg, &lease); 1218 rv = get_int(&mesg, &i);
1220 if (rv) 1219 if (rv)
1221 return rv; 1220 return rv;
1222 if (lease < 10 || lease > 3600) 1221 /*
1222 * Some sanity checking. We don't have a reason for
1223 * these particular numbers, but problems with the
1224 * extremes are:
1225 * - Too short: the briefest network outage may
1226 * cause clients to lose all their locks. Also,
1227 * the frequent polling may be wasteful.
1228 * - Too long: do you really want reboot recovery
1229 * to take more than an hour? Or to make other
1230 * clients wait an hour before being able to
1231 * revoke a dead client's locks?
1232 */
1233 if (i < 10 || i > 3600)
1223 return -EINVAL; 1234 return -EINVAL;
1224 nfs4_reset_lease(lease); 1235 *time = i;
1225 } 1236 }
1226 1237
1227 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1238 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1228 nfs4_lease_time()); 1239}
1240
1241static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1242{
1243 ssize_t rv;
1244
1245 mutex_lock(&nfsd_mutex);
1246 rv = __nfsd4_write_time(file, buf, size, time);
1247 mutex_unlock(&nfsd_mutex);
1248 return rv;
1229} 1249}
1230 1250
1231/** 1251/**
@@ -1251,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1251 */ 1271 */
1252static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1272static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1253{ 1273{
1254 ssize_t rv; 1274 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1275}
1255 1276
1256 mutex_lock(&nfsd_mutex); 1277/**
1257 rv = __write_leasetime(file, buf, size); 1278 * write_gracetime - Set or report current NFSv4 grace period time
1258 mutex_unlock(&nfsd_mutex); 1279 *
1259 return rv; 1280 * As above, but sets the time of the NFSv4 grace period.
1281 *
1282 * Note this should never be set to less than the *previous*
1283 * lease-period time, but we don't try to enforce this. (In the common
1284 * case (a new boot), we don't know what the previous lease time was
1285 * anyway.)
1286 */
1287static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1288{
1289 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1260} 1290}
1261 1291
1262extern char *nfs4_recoverydir(void); 1292extern char *nfs4_recoverydir(void);
@@ -1350,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1350 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1380 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1351#ifdef CONFIG_NFSD_V4 1381#ifdef CONFIG_NFSD_V4
1352 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1383 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1353 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1354#endif 1385#endif
1355 /* last one */ {""} 1386 /* last one */ {""}
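The refactor above replaces the lease-only __write_leasetime() with __nfsd4_write_time(), which validates the value and stores it through a time_t pointer, so write_leasetime() and the new write_gracetime() share one code path under nfsd_mutex. A user-space sketch of that pointer-parameterized pattern, with pthread locking standing in for the kernel mutex (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static time_t lease_time = 90;
static time_t grace_time = 90;

/* Validate and store a new value; the 10..3600 window matches the
 * sanity check in the patch (too short risks losing locks over a
 * brief outage, too long stalls reboot recovery for everyone). */
static int write_time(const char *buf, time_t *slot)
{
	char *end;
	long v = strtol(buf, &end, 10);

	if (end == buf || v < 10 || v > 3600)
		return -1;
	pthread_mutex_lock(&state_mutex);
	*slot = v;
	pthread_mutex_unlock(&state_mutex);
	return 0;
}

int main(void)
{
	write_time("120", &lease_time);
	write_time("90", &grace_time);
	printf("lease=%ld grace=%ld\n", (long)lease_time, (long)grace_time);
	return 0;
}

One slot-setter per tunable then reduces to a one-line wrapper, exactly as write_leasetime() and write_gracetime() do above.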
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..72377761270e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
82void nfsd4_free_slabs(void); 82void nfsd4_free_slabs(void);
83int nfs4_state_start(void); 83int nfs4_state_start(void);
84void nfs4_state_shutdown(void); 84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime); 85void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir); 86int nfs4_reset_recoverydir(char *recdir);
88#else 87#else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { } 89static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; } 90static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { } 91static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { } 92static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 93static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif 94#endif
@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot;
229 227
230#ifdef CONFIG_NFSD_V4 228#ifdef CONFIG_NFSD_V4
231 229
230extern time_t nfsd4_lease;
231extern time_t nfsd4_grace;
232
232/* before processing a COMPOUND operation, we have to check that there 233/* before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. otherwise, 234 * is enough space in the buffer for XDR encode to succeed. otherwise,
234 * we might process an operation with side effects, and be unable to 235 * we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot;
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 248#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 249#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
249 250
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252 252
253/* 253/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..06b2a26edfe0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
120int nfsd_vers(int vers, enum vers_op change) 120int nfsd_vers(int vers, enum vers_op change)
121{ 121{
122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
123 return -1; 123 return 0;
124 switch(change) { 124 switch(change) {
125 case NFSD_SET: 125 case NFSD_SET:
126 nfsd_versions[vers] = nfsd_version[vers]; 126 nfsd_versions[vers] = nfsd_version[vers];
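The one-line nfssvc.c change matters because nfsd_vers() effectively answers "is this version enabled?": returning -1 for an out-of-range version reads as true in a boolean test, while 0 gives the correct "no". A toy illustration of the hazard:

#include <stdbool.h>
#include <stdio.h>

#define MINVERS 2
#define NRVERS  5

static bool versions[NRVERS] = { false, false, true, true, true };

/* Returning 0 (false) for an out-of-range version keeps callers that
 * write `if (vers_ok(n))` honest; the old -1 was truthy there. */
static int vers_ok(int vers)
{
	if (vers < MINVERS || vers >= NRVERS)
		return 0;
	return versions[vers];
}

int main(void)
{
	printf("v3: %d, v9: %d\n", vers_ok(3), vers_ok(9));
	return 0;
}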
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..006c84230c7c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
70 struct nfs4_client *cbs_clp; 70 struct nfs4_client *cbs_clp;
71}; 71};
72 72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args;
80 struct work_struct cb_work;
81};
82
73struct nfs4_delegation { 83struct nfs4_delegation {
74 struct list_head dl_perfile; 84 struct list_head dl_perfile;
75 struct list_head dl_perclnt; 85 struct list_head dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
86 stateid_t dl_stateid; 96 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
88 int dl_retries; 98 int dl_retries;
99 struct nfsd4_callback dl_recall;
89}; 100};
90 101
91/* client delegation callback info */ 102/* client delegation callback info */
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
96 u32 cb_prog; 107 u32 cb_prog;
97 u32 cb_minorversion; 108 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */ 109 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */ 110 struct svc_xprt *cb_xprt; /* minorversion 1 only */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102}; 111};
103 112
104/* Maximum number of slots per session. 160 is useful for long haul TCP */ 113/* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
157 struct list_head se_hash; /* hash by sessionid */ 166 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt; 167 struct list_head se_perclnt;
159 u32 se_flags; 168 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */ 169 struct nfs4_client *se_client;
161 struct nfs4_sessionid se_sessionid; 170 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel; 171 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel; 172 struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
212 struct svc_cred cl_cred; /* setclientid principal */ 221 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */ 222 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */ 223 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */ 224 u32 cl_firststate; /* recovery dir creation */
218 225
226 /* for v4.0 and v4.1 callbacks: */
227 struct nfs4_cb_conn cl_cb_conn;
228 struct rpc_clnt *cl_cb_client;
229 atomic_t cl_cb_set;
230
219 /* for nfs41 */ 231 /* for nfs41 */
220 struct list_head cl_sessions; 232 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 233 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags; 234 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid; 235 struct nfs4_sessionid cl_sessionid;
236 /* number of rpc's in progress over an associated session: */
237 atomic_t cl_refcount;
224 238
225 /* for nfs41 callbacks */ 239 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */ 240 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy; 241 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr; 242 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 243 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */ 244 /* wait here for slots */
232}; 245};
233 246
247static inline void
248mark_client_expired(struct nfs4_client *clp)
249{
250 clp->cl_time = 0;
251}
252
253static inline bool
254is_client_expired(struct nfs4_client *clp)
255{
256 return clp->cl_time == 0;
257}
258
234/* struct nfs4_client_reset 259/* struct nfs4_client_reset
235 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl 260 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state 261 * upon lease reset, or from upcall to state_daemon (to read in state
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void); 402extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void); 403extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 404extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref); 405extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void); 406extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp); 407extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
408extern void nfsd4_do_callback_rpc(struct work_struct *);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 409extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
410extern int nfsd4_create_callback_queue(void);
411extern void nfsd4_destroy_callback_queue(void);
412extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp); 413extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 414extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name); 415extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void); 420extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 421extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 422extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
423extern void release_session_client(struct nfsd4_session *);
395 424
396static inline void 425static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so) 426nfs4_put_stateowner(struct nfs4_stateowner *so)
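The new mark_client_expired()/is_client_expired() helpers in state.h encode expiry by zeroing cl_time rather than keeping a separate flag, so one field carries both the renewal timestamp and the expired state. A standalone sketch of that sentinel pattern (struct and field names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct client {
	time_t last_renew;	/* 0 doubles as the "expired" sentinel */
};

static void mark_expired(struct client *c) { c->last_renew = 0; }
static bool is_expired(const struct client *c) { return c->last_renew == 0; }

int main(void)
{
	struct client c = { .last_renew = time(NULL) };

	mark_expired(&c);
	printf("expired: %d\n", is_expired(&c));
	return 0;
}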
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..ebbf3b6b2457 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
25#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h> 26#include <linux/jhash.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <linux/exportfs.h> 30#include <linux/exportfs.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
@@ -723,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
723 struct inode *inode; 724 struct inode *inode;
724 int flags = O_RDONLY|O_LARGEFILE; 725 int flags = O_RDONLY|O_LARGEFILE;
725 __be32 err; 726 __be32 err;
726 int host_err; 727 int host_err = 0;
727 728
728 validate_process_creds(); 729 validate_process_creds();
729 730
@@ -760,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
760 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
761 * This may block while leases are broken. 762 * This may block while leases are broken.
762 */ 763 */
763 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 764 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
765 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
764 if (host_err == -EWOULDBLOCK) 766 if (host_err == -EWOULDBLOCK)
765 host_err = -ETIMEDOUT; 767 host_err = -ETIMEDOUT;
766 if (host_err) /* NOMEM or WOULDBLOCK */ 768 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -997,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
997 999
998 if (inode->i_state & I_DIRTY) { 1000 if (inode->i_state & I_DIRTY) {
999 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1001 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1000 err = vfs_fsync(file, file->f_path.dentry, 0); 1002 err = vfs_fsync(file, 0);
1001 } 1003 }
1002 last_ino = inode->i_ino; 1004 last_ino = inode->i_ino;
1003 last_dev = inode->i_sb->s_dev; 1005 last_dev = inode->i_sb->s_dev;
@@ -1168,12 +1170,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1168 goto out; 1170 goto out;
1169 } 1171 }
1170 1172
1171 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1173 err = nfsd_open(rqstp, fhp, S_IFREG,
1174 NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
1172 if (err) 1175 if (err)
1173 goto out; 1176 goto out;
1174 if (EX_ISSYNC(fhp->fh_export)) { 1177 if (EX_ISSYNC(fhp->fh_export)) {
1175 int err2 = vfs_fsync_range(file, file->f_path.dentry, 1178 int err2 = vfs_fsync_range(file, offset, end, 0);
1176 offset, end, 0);
1177 1179
1178 if (err2 != -EINVAL) 1180 if (err2 != -EINVAL)
1179 err = nfserrno(err2); 1181 err = nfserrno(err2);
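In vfs.c above, nfsd_open() now skips break_lease() when the caller sets NFSD_MAY_NOT_BREAK_LEASE, which nfsd_commit() does since a COMMIT must not recall delegations; initializing host_err to 0 keeps the skipped branch from testing an uninitialized value. A reduced sketch of the gated step, with stand-in names and a stub in place of break_lease():

#include <stdio.h>

#define MAY_WRITE           0x02
#define MAY_NOT_BREAK_LEASE 0x200	/* mirrors the new bit in vfs.h */

/* Stand-in for break_lease(); always succeeds here. */
static int break_lease_stub(void) { return 0; }

static int open_path(unsigned int access)
{
	int err = 0;	/* must start at 0: the gated call may be skipped */

	if (!(access & MAY_NOT_BREAK_LEASE))
		err = break_lease_stub();
	return err;
}

int main(void)
{
	printf("commit open: %d\n",
	       open_path(MAY_WRITE | MAY_NOT_BREAK_LEASE));
	return 0;
}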
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..217a62c2a357 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
20#define NFSD_MAY_OWNER_OVERRIDE 64 20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512
23 24
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid; 381 struct nfs4_sessionid sessionid;
382}; 382};
383 383
384struct nfsd4_reclaim_complete {
385 u32 rca_one_fs;
386};
387
384struct nfsd4_op { 388struct nfsd4_op {
385 int opnum; 389 int opnum;
386 __be32 status; 390 __be32 status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
421 struct nfsd4_create_session create_session; 425 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session; 426 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence; 427 struct nfsd4_sequence sequence;
428 struct nfsd4_reclaim_complete reclaim_complete;
424 } u; 429 } u;
425 struct nfs4_replay * replay; 430 struct nfs4_replay * replay;
426}; 431};
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 518extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq); 519 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *, 521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
517struct nfsd4_exchange_id *); 522extern __be32 nfsd4_create_session(struct svc_rqst *,
518 extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *, 523 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *); 524 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *, 525extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
524extern __be32 nfsd4_destroy_session(struct svc_rqst *, 528extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *); 530 struct nfsd4_destroy_session *);
531__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 532extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open); 533 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 534extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,10 +26,16 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
32 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
33static inline unsigned long 39static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{ 41{
@@ -37,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
37 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
38} 44}
39 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
40static inline unsigned long 50static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
42{ 52{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44} 54}
45 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{ 62{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -68,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
68 return 0; 83 return 0;
69} 84}
70 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset) 93 unsigned long *offset)
73{ 94{
@@ -77,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
77 return group; 98 return group;
78} 99}
79 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
80static unsigned long 109static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{ 111{
@@ -85,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86} 115}
87 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
88static unsigned long 125static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{ 127{
@@ -94,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95} 132}
96 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
97static unsigned long 140static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -106,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
106 return nfree; 149 return nfree;
107} 150}
108 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
109static void 159static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group, 161 unsigned long group,
@@ -117,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118} 168}
119 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
120static unsigned long 175static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{ 177{
@@ -128,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129} 184}
130 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
131static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
133{ 194{
@@ -178,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
178 return ret; 239 return ret;
179} 240}
180 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
181static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
182 unsigned long group, 250 unsigned long group,
183 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -190,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
190 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
191} 259}
192 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
193static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
194 unsigned long group, 269 unsigned long group,
195 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -202,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
202 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
203} 278}
204 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
205int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
206 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
207{ 289{
@@ -213,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
213 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
214} 296}
215 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
216static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
217nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
218 unsigned long group, 307 unsigned long group,
@@ -222,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
222 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
223} 312}
224 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
225void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
226 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
227{ 323{
@@ -234,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
234 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
235} 331}
236 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
237static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
238 unsigned long group, 342 unsigned long group,
239 unsigned long target, 343 unsigned long target,
240 unsigned char *bitmap, 344 unsigned char *bitmap,
241 int bsize) /* size in bits */ 345 int bsize)
242{ 346{
243 int curr, pos, end, i; 347 int curr, pos, end, i;
244 348
@@ -276,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
276 return -ENOSPC; 380 return -ENOSPC;
277} 381}
278 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
279static unsigned long 390static unsigned long
280nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
281 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -286,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
286 max - curr + 1); 397 max - curr + 1);
287} 398}
288 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
289int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
290 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
291{ 407{
@@ -365,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
365 return ret; 481 return ret;
366} 482}
367 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
368void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
369 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
370{ 491{
@@ -376,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
376 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
377} 498}
378 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
379void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
380 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
381{ 507{
@@ -409,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
409 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
410} 536}
411 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
412void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
413 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
414{ 545{
@@ -425,7 +556,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 556 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 557 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 558 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
430 561
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
@@ -441,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
441 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
442} 573}
443 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
444int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
445 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
446{ 582{
@@ -463,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
463 return 0; 599 return 0;
464} 600}
465 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
466void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
467 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
468{ 609{
@@ -474,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
474 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
475} 616}
476 617
618/**
619 * nilfs_palloc_group_is_in - judge if an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
477static int 624static int
478nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
479{ 626{
@@ -484,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
484 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
485} 632}
486 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
487int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
488{ 641{
489 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
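The kernel-doc added above describes how nilfs2's persistent-object allocator maps an entry number to a (group, offset) pair, where one bitmap block serves one group, so a group holds block-size * 8 entries. A sketch of that arithmetic, assuming a 4 KiB block (4096 * 8 = 32768 entries per group); the constants are assumptions for illustration:

#include <stdio.h>

#define BLOCK_SIZE        4096UL
#define ENTRIES_PER_GROUP (BLOCK_SIZE * 8)	/* one bitmap bit per entry */

/* Split an entry number (e.g. an inode number) into a group number
 * and an in-group offset, as nilfs_palloc_group() does. */
static unsigned long palloc_group(unsigned long long nr, unsigned long *offset)
{
	*offset = nr % ENTRIES_PER_GROUP;
	return nr / ENTRIES_PER_GROUP;
}

int main(void)
{
	unsigned long off;
	unsigned long grp = palloc_group(100000ULL, &off);

	printf("entry 100000 -> group %lu, offset %lu\n", grp, off);
	return 0;
}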
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
566 path = nilfs_btree_alloc_path(); 523 path = nilfs_btree_alloc_path();
567 if (path == NULL) 524 if (path == NULL)
568 return -ENOMEM; 525 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 526
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 527 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572 528
573 if (ptrp != NULL) 529 if (ptrp != NULL)
574 *ptrp = ptr; 530 *ptrp = ptr;
575 531
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 532 nilfs_btree_free_path(path);
578 533
579 return ret; 534 return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
594 path = nilfs_btree_alloc_path(); 549 path = nilfs_btree_alloc_path();
595 if (path == NULL) 550 if (path == NULL)
596 return -ENOMEM; 551 return -ENOMEM;
597 nilfs_btree_init_path(path); 552
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 553 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
599 if (ret < 0) 554 if (ret < 0)
600 goto out; 555 goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 610 *ptrp = ptr;
656 ret = cnt; 611 ret = cnt;
657 out: 612 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 613 nilfs_btree_free_path(path);
660 return ret; 614 return ret;
661} 615}
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1123 path = nilfs_btree_alloc_path(); 1077 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1078 if (path == NULL)
1125 return -ENOMEM; 1079 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1080
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1081 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1082 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1093 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1141 1094
1142 out: 1095 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1096 nilfs_btree_free_path(path);
1145 return ret; 1097 return ret;
1146} 1098}
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1456 path = nilfs_btree_alloc_path(); 1408 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1409 if (path == NULL)
1458 return -ENOMEM; 1410 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1411
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1413 NILFS_BTREE_LEVEL_NODE_MIN);
1462 if (ret < 0) 1414 if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1474 1426
1475out: 1427out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1428 nilfs_btree_free_path(path);
1478 return ret; 1429 return ret;
1479} 1430}
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1488 path = nilfs_btree_alloc_path(); 1439 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1440 if (path == NULL)
1490 return -ENOMEM; 1441 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1442
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1443 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1444
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1445 nilfs_btree_free_path(path);
1497 1446
1498 return ret; 1447 return ret;
@@ -1879,7 +1828,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1828 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1829 int level, struct buffer_head *bh)
1881{ 1830{
1882 int maxlevel, ret; 1831 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1832 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1834 __u64 ptr;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1923 path = nilfs_btree_alloc_path(); 1872 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1873 if (path == NULL)
1925 return -ENOMEM; 1874 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1875
1928 if (buffer_nilfs_node(bh)) { 1876 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1877 node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1895 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1896
1949 out: 1897 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1898 nilfs_btree_free_path(path);
1952 1899
1953 return ret; 1900 return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2108 path = nilfs_btree_alloc_path(); 2055 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2056 if (path == NULL)
2110 return -ENOMEM; 2057 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2058
2113 if (buffer_nilfs_node(*bh)) { 2059 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2060 node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2077
2132 out: 2078 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2079 nilfs_btree_free_path(path);
2135 2080
2136 return ret; 2081 return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2175 path = nilfs_btree_alloc_path(); 2120 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2121 if (path == NULL)
2177 return -ENOMEM; 2122 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2123
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2181 if (ret < 0) { 2125 if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2139 nilfs_bmap_set_dirty(&btree->bt_bmap);
2196 2140
2197 out: 2141 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2142 nilfs_btree_free_path(path);
2200 return ret; 2143 return ret;
2201} 2144}
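The btree.c rework above folds path initialization into nilfs_btree_alloc_path() and buffer release (brelse) into nilfs_btree_free_path(), deleting the separate init/release calls at every call site. A user-space sketch of the alloc-initializes / free-releases pairing (types and names are stand-ins):

#include <stdio.h>
#include <stdlib.h>

#define LEVEL_MAX 14

struct path_node {
	void *bh;	/* stand-in for a buffer_head reference */
	int   index;
};

/* Allocation and initialization in one step: callers can no longer
 * forget the init pass, which is the bug class the patch removes. */
static struct path_node *path_alloc(void)
{
	return calloc(LEVEL_MAX, sizeof(struct path_node));
}

/* Free releases per-level resources before the container itself,
 * the role brelse() plays in the kernel code. */
static void path_free(struct path_node *p)
{
	for (int i = 0; i < LEVEL_MAX; i++)
		free(p[i].bh);
	free(p);
}

int main(void)
{
	struct path_node *p = path_alloc();

	if (p)
		path_free(p);
	return 0;
}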
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
30#include "btnode.h" 30#include "btnode.h"
31#include "bmap.h" 31#include "bmap.h"
32 32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/** 33/**
37 * struct nilfs_btree - B-tree structure 34 * struct nilfs_btree - B-tree structure
38 * @bt_bmap: bmap base structure 35 * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
41 struct nilfs_bmap bt_bmap; 38 struct nilfs_bmap bt_bmap;
42}; 39};
43 40
41/**
42 * struct nilfs_btree_path - A path on which B-tree operations are executed
43 * @bp_bh: buffer head of node block
44 * @bp_sib_bh: buffer head of sibling node block
45 * @bp_index: index of child node
46 * @bp_oldreq: ptr end request for old ptr
47 * @bp_newreq: ptr alloc request for new ptr
48 * @bp_op: rebalance operation
49 */
50struct nilfs_btree_path {
51 struct buffer_head *bp_bh;
52 struct buffer_head *bp_sib_bh;
53 int bp_index;
54 union nilfs_bmap_ptr_req bp_oldreq;
55 union nilfs_bmap_ptr_req bp_newreq;
56 struct nilfs_btnode_chkey_ctxt bp_ctxt;
57 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
58 int, __u64 *, __u64 *);
59};
44 60
45#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE 61#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
46#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ 62#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
57#define NILFS_BTREE_KEY_MIN ((__u64)0) 73#define NILFS_BTREE_KEY_MIN ((__u64)0)
58#define NILFS_BTREE_KEY_MAX (~(__u64)0) 74#define NILFS_BTREE_KEY_MAX (~(__u64)0)
59 75
76extern struct kmem_cache *nilfs_btree_path_cache;
60 77
61int nilfs_btree_path_cache_init(void); 78int nilfs_btree_path_cache_init(void);
62void nilfs_btree_path_cache_destroy(void); 79void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
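The nilfs_sync_file() change tracks the VFS ->fsync prototype: the dentry argument is dropped and the inode is taken from file->f_mapping->host, the same shape as the vfs_fsync(file, 0) call earlier in this series. A toy model of deriving the object from the handle alone, so a caller cannot pass a mismatched pair:

#include <stdio.h>

struct inode   { long ino; };
struct mapping { struct inode *host; };
struct file    { struct mapping *f_mapping; };

/* New-style fsync hook: one handle in, inode derived from it. */
static int sync_file(struct file *f, int datasync)
{
	struct inode *inode = f->f_mapping->host;

	printf("sync ino %ld (datasync=%d)\n", inode->ino, datasync);
	return 0;
}

int main(void)
{
	struct inode i = { .ino = 42 };
	struct mapping m = { .host = &i };
	struct file f = { .f_mapping = &m };

	return sync_file(&f, 0);
}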
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8880a9e281e7..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/mpage.h> 46#include <linux/mpage.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/slab.h>
48#include <linux/swap.h> 49#include <linux/swap.h>
49#include "nilfs.h" 50#include "nilfs.h"
50#include "page.h" 51#include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/gfp.h>
25#include <linux/mpage.h> 26#include <linux/mpage.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27#include <linux/uio.h> 28#include <linux/uio.h>
@@ -279,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
280 281
281 atomic_inc(&sbi->s_inodes_count); 282 atomic_inc(&sbi->s_inodes_count);
282 283 inode_init_owner(inode, dir, mode);
283 inode->i_uid = current_fsuid();
284 if (dir->i_mode & S_ISGID) {
285 inode->i_gid = dir->i_gid;
286 if (S_ISDIR(mode))
287 mode |= S_ISGID;
288 } else
289 inode->i_gid = current_fsgid();
290
291 inode->i_mode = mode;
292 inode->i_ino = ino; 284 inode->i_ino = ino;
293 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 285 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
294 286
@@ -450,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
450 inode->i_op = &nilfs_special_inode_operations; 442 inode->i_op = &nilfs_special_inode_operations;
451 init_special_inode( 443 init_special_inode(
452 inode, inode->i_mode, 444 inode, inode->i_mode,
453 new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 445 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
454 } 446 }
455 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 447 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
456 brelse(bh); 448 brelse(bh);
@@ -510,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode,
510 nilfs_bmap_write(ii->i_bmap, raw_inode); 502 nilfs_bmap_write(ii->i_bmap, raw_inode);
511 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 503 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
512 raw_inode->i_device_code = 504 raw_inode->i_device_code =
513 cpu_to_le64(new_encode_dev(inode->i_rdev)); 505 cpu_to_le64(huge_encode_dev(inode->i_rdev));
514 /* When extending inode, nilfs->ns_inode_size should be checked 506 /* When extending inode, nilfs->ns_inode_size should be checked
515 for substitutions of appended fields */ 507 for substitutions of appended fields */
516} 508}
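The nine deleted lines in nilfs_new_inode() (fsuid ownership, gid inheritance from a setgid directory, and setgid propagation to new directories) are exactly what the shared inode_init_owner() helper performs; the same file also switches to huge_{en,de}code_dev() for wider device numbers. A user-space model of the consolidated ownership logic, with illustrative types and constants:

#include <stdbool.h>
#include <stdio.h>

#define S_IFMT  0170000
#define S_IFDIR 0040000
#define S_ISGID 0002000

struct mini_inode { unsigned uid, gid, mode; };

static bool is_dir(unsigned mode) { return (mode & S_IFMT) == S_IFDIR; }

/* The logic the patch deletes, gathered into one helper; this models
 * what inode_init_owner() provides to all filesystems. */
static void init_owner(struct mini_inode *inode,
		       const struct mini_inode *dir,
		       unsigned mode, unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir->mode & S_ISGID) {
		inode->gid = dir->gid;
		if (is_dir(mode))
			mode |= S_ISGID;	/* directories inherit setgid */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct mini_inode dir = { .uid = 0, .gid = 100,
				  .mode = S_IFDIR | S_ISGID };
	struct mini_inode child;

	init_owner(&child, &dir, S_IFDIR, 1000, 1000);
	printf("gid=%u setgid=%d\n", child.gid, !!(child.mode & S_ISGID));
	return 0;
}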
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ 25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h>
26#include <linux/capability.h> /* capable() */ 27#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
@@ -648,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
648long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 649long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
649{ 650{
650 struct inode *inode = filp->f_dentry->d_inode; 651 struct inode *inode = filp->f_dentry->d_inode;
651 void __user *argp = (void * __user *)arg; 652 void __user *argp = (void __user *)arg;
652 653
653 switch (cmd) { 654 switch (cmd) {
654 case NILFS_IOCTL_CHANGE_CPMODE: 655 case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/slab.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
31#include "page.h" 32#include "page.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index fc246dba112a..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
@@ -104,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
104 105
105 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); 106 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
106 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); 107 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
108
109 /* need to verify ->ss_bytes field if read ->ss_cno */
107} 110}
108 111
109/** 112/**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 636eaafd6ea2..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/slab.h>
28#include "page.h" 29#include "page.h"
29#include "segbuf.h" 30#include "segbuf.h"
30 31
@@ -39,35 +40,10 @@ struct nilfs_write_info {
39 sector_t blocknr; 40 sector_t blocknr;
40}; 41};
41 42
42
43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
44 struct the_nilfs *nilfs); 44 struct the_nilfs *nilfs);
45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); 45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
46 46
47
48static struct kmem_cache *nilfs_segbuf_cachep;
49
50static void nilfs_segbuf_init_once(void *obj)
51{
52 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
53}
54
55int __init nilfs_init_segbuf_cache(void)
56{
57 nilfs_segbuf_cachep =
58 kmem_cache_create("nilfs2_segbuf_cache",
59 sizeof(struct nilfs_segment_buffer),
60 0, SLAB_RECLAIM_ACCOUNT,
61 nilfs_segbuf_init_once);
62
63 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
64}
65
66void nilfs_destroy_segbuf_cache(void)
67{
68 kmem_cache_destroy(nilfs_segbuf_cachep);
69}
70
71struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) 47struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
72{ 48{
73 struct nilfs_segment_buffer *segbuf; 49 struct nilfs_segment_buffer *segbuf;
@@ -80,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
80 INIT_LIST_HEAD(&segbuf->sb_list); 56 INIT_LIST_HEAD(&segbuf->sb_list);
81 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); 57 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
82 INIT_LIST_HEAD(&segbuf->sb_payload_buffers); 58 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
59 segbuf->sb_super_root = NULL;
83 60
84 init_completion(&segbuf->sb_bio_event); 61 init_completion(&segbuf->sb_bio_event);
85 atomic_set(&segbuf->sb_err, 0); 62 atomic_set(&segbuf->sb_err, 0);
@@ -157,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
157} 134}
158 135
159int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, 136int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
160 time_t ctime) 137 time_t ctime, __u64 cno)
161{ 138{
162 int err; 139 int err;
163 140
@@ -170,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
170 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); 147 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
171 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; 148 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
172 segbuf->sb_sum.ctime = ctime; 149 segbuf->sb_sum.ctime = ctime;
150 segbuf->sb_sum.cno = cno;
173 return 0; 151 return 0;
174} 152}
175 153
@@ -195,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
195 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); 173 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
196 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); 174 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
197 raw_sum->ss_pad = 0; 175 raw_sum->ss_pad = 0;
176 raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
198} 177}
199 178
200/* 179/*
201 * CRC calculation routines 180 * CRC calculation routines
202 */ 181 */
203void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, 182static void
204 u32 seed) 183nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
205{ 184{
206 struct buffer_head *bh; 185 struct buffer_head *bh;
207 struct nilfs_segment_summary *raw_sum; 186 struct nilfs_segment_summary *raw_sum;
@@ -228,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
228 raw_sum->ss_sumsum = cpu_to_le32(crc); 207 raw_sum->ss_sumsum = cpu_to_le32(crc);
229} 208}
230 209
231void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, 210static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
232 u32 seed) 211 u32 seed)
233{ 212{
234 struct buffer_head *bh; 213 struct buffer_head *bh;
235 struct nilfs_segment_summary *raw_sum; 214 struct nilfs_segment_summary *raw_sum;
@@ -255,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
255 raw_sum->ss_datasum = cpu_to_le32(crc); 234 raw_sum->ss_datasum = cpu_to_le32(crc);
256} 235}
257 236
237static void
238nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
239 u32 seed)
240{
241 struct nilfs_super_root *raw_sr;
242 u32 crc;
243
244 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
245 crc = crc32_le(seed,
246 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
247 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
248 raw_sr->sr_sum = cpu_to_le32(crc);
249}
250
258static void nilfs_release_buffers(struct list_head *list) 251static void nilfs_release_buffers(struct list_head *list)
259{ 252{
260 struct buffer_head *bh, *n; 253 struct buffer_head *bh, *n;
@@ -281,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
281{ 274{
282 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 275 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
283 nilfs_release_buffers(&segbuf->sb_payload_buffers); 276 nilfs_release_buffers(&segbuf->sb_payload_buffers);
277 segbuf->sb_super_root = NULL;
284} 278}
285 279
286/* 280/*
@@ -323,14 +317,31 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
323int nilfs_wait_on_logs(struct list_head *logs) 317int nilfs_wait_on_logs(struct list_head *logs)
324{ 318{
325 struct nilfs_segment_buffer *segbuf; 319 struct nilfs_segment_buffer *segbuf;
326 int err; 320 int err, ret = 0;
327 321
328 list_for_each_entry(segbuf, logs, sb_list) { 322 list_for_each_entry(segbuf, logs, sb_list) {
329 err = nilfs_segbuf_wait(segbuf); 323 err = nilfs_segbuf_wait(segbuf);
330 if (err) 324 if (err && !ret)
331 return err; 325 ret = err;
326 }
327 return ret;
328}
329
330/**
331 * nilfs_add_checksums_on_logs - add checksums on the logs
332 * @logs: list of segment buffers storing target logs
333 * @seed: checksum seed value
334 */
335void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
336{
337 struct nilfs_segment_buffer *segbuf;
338
339 list_for_each_entry(segbuf, logs, sb_list) {
340 if (segbuf->sb_super_root)
341 nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
342 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
343 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
332 } 344 }
333 return 0;
334} 345}
335 346
336/* 347/*
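
Two patterns in the segbuf.c hunks above are worth spelling out. First, nilfs_wait_on_logs() no longer bails out on the first failed log; it waits on every log and reports only the first error, presumably so that no in-flight I/O is left unwaited when one write fails. A sketch of that shape, with invented names:

#include <linux/list.h>

struct item {
        struct list_head list;
};

static int item_wait(struct item *it);  /* assumed per-item wait */

/* Wait on every item; remember the first failure, skip none. */
static int wait_on_all(struct list_head *items)
{
        struct item *it;
        int err, ret = 0;

        list_for_each_entry(it, items, list) {
                err = item_wait(it);
                if (err && !ret)
                        ret = err;      /* keep the first failure only */
        }
        return ret;
}

Second, nilfs_segbuf_fill_in_super_root_crc() checksums everything in the super root except the checksum field itself, which is why the crc32_le() call starts sizeof(raw_sr->sr_sum) bytes into the block and covers NILFS_SR_BYTES minus that prefix.
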
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
37 * @sumbytes: Byte count of segment summary 37 * @sumbytes: Byte count of segment summary
38 * @nfileblk: Total number of file blocks 38 * @nfileblk: Total number of file blocks
39 * @seg_seq: Segment sequence number 39 * @seg_seq: Segment sequence number
40 * @cno: Checkpoint number
40 * @ctime: Creation time 41 * @ctime: Creation time
41 * @next: Block number of the next full segment 42 * @next: Block number of the next full segment
42 */ 43 */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
48 unsigned long sumbytes; 49 unsigned long sumbytes;
49 unsigned long nfileblk; 50 unsigned long nfileblk;
50 u64 seg_seq; 51 u64 seg_seq;
52 __u64 cno;
51 time_t ctime; 53 time_t ctime;
52 sector_t next; 54 sector_t next;
53}; 55};
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
76 * @sb_rest_blocks: Number of residual blocks in the current segment 78 * @sb_rest_blocks: Number of residual blocks in the current segment
77 * @sb_segsum_buffers: List of buffers for segment summaries 79 * @sb_segsum_buffers: List of buffers for segment summaries
78 * @sb_payload_buffers: List of buffers for segment payload 80 * @sb_payload_buffers: List of buffers for segment payload
81 * @sb_super_root: Pointer to buffer storing a super root block (if exists)
79 * @sb_nbio: Number of flying bio requests 82 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status 83 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing 84 * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
95 /* Buffers */ 98 /* Buffers */
96 struct list_head sb_segsum_buffers; 99 struct list_head sb_segsum_buffers;
97 struct list_head sb_payload_buffers; /* including super root */ 100 struct list_head sb_payload_buffers; /* including super root */
101 struct buffer_head *sb_super_root;
98 102
99 /* io status */ 103 /* io status */
100 int sb_nbio; 104 int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
121 b_assoc_buffers)) 125 b_assoc_buffers))
122#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) 126#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
123 127
128extern struct kmem_cache *nilfs_segbuf_cachep;
124 129
125int __init nilfs_init_segbuf_cache(void); 130int __init nilfs_init_segbuf_cache(void);
126void nilfs_destroy_segbuf_cache(void); 131void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev); 137 struct nilfs_segment_buffer *prev);
133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 138void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
134 struct the_nilfs *); 139 struct the_nilfs *);
135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 140int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
136int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); 141int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
137int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, 142int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
138 struct buffer_head **); 143 struct buffer_head **);
139void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); 144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
140void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
141void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
142 145
143static inline void 146static inline void
144nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, 147nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last); 174 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); 175int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs); 176int nilfs_wait_on_logs(struct list_head *logs);
177void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
174 178
175static inline void nilfs_destroy_logs(struct list_head *logs) 179static inline void nilfs_destroy_logs(struct list_head *logs)
176{ 180{
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 69576a95e13f..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -115,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) 116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) 117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
117 118
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error code is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) 119static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{ 120{
156 struct nilfs_transaction_info *cur_ti = current->journal_info; 121 struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -401,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
401 366
402 if (nilfs_doing_gc()) 367 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC; 368 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); 369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
370 sci->sc_sbi->s_nilfs->ns_cno);
405 if (unlikely(err)) 371 if (unlikely(err))
406 return err; 372 return err;
407 373
@@ -434,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
434 return err; 400 return err;
435 segbuf = sci->sc_curseg; 401 segbuf = sci->sc_curseg;
436 } 402 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); 403 err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
438 if (likely(!err)) 404 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR; 405 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err; 406 return err;
@@ -598,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
598 *vblocknr = binfo->bi_v.bi_vblocknr; 564 *vblocknr = binfo->bi_v.bi_vblocknr;
599} 565}
600 566
601struct nilfs_sc_operations nilfs_sc_file_ops = { 567static struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data, 568 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node, 569 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap, 570 .collect_bmap = nilfs_collect_file_bmap,
@@ -648,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
648 *binfo_dat = binfo->bi_dat; 614 *binfo_dat = binfo->bi_dat;
649} 615}
650 616
651struct nilfs_sc_operations nilfs_sc_dat_ops = { 617static struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data, 618 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node, 619 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap, 620 .collect_bmap = nilfs_collect_dat_bmap,
@@ -656,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
656 .write_node_binfo = nilfs_write_dat_node_binfo, 622 .write_node_binfo = nilfs_write_dat_node_binfo,
657}; 623};
658 624
659struct nilfs_sc_operations nilfs_sc_dsync_ops = { 625static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data, 626 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL, 627 .collect_node = NULL,
662 .collect_bmap = NULL, 628 .collect_bmap = NULL,
@@ -931,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
931 } 897 }
932} 898}
933 899
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs) 901 struct the_nilfs *nilfs)
965{ 902{
966 struct buffer_head *bh_sr = sci->sc_super_root; 903 struct buffer_head *bh_sr;
967 struct nilfs_super_root *raw_sr = 904 struct nilfs_super_root *raw_sr;
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size; 905 unsigned isz = nilfs->ns_inode_size;
970 906
907 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
908 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
909
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); 910 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime 911 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ? 912 = cpu_to_le64(nilfs_doing_gc() ?
@@ -1490,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1490 1429
1491 /* Collection retry loop */ 1430 /* Collection retry loop */
1492 for (;;) { 1431 for (;;) {
1493 sci->sc_super_root = NULL;
1494 sci->sc_nblk_this_inc = 0; 1432 sci->sc_nblk_this_inc = 0;
1495 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1433 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1496 1434
@@ -1510,6 +1448,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1448 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1511 break; 1449 break;
1512 1450
1451 nilfs_clear_logs(&sci->sc_segbufs);
1452
1453 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1454 if (unlikely(err))
1455 return err;
1456
1513 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1457 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1514 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1458 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1515 sci->sc_freesegs, 1459 sci->sc_freesegs,
@@ -1517,12 +1461,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1517 NULL); 1461 NULL);
1518 WARN_ON(err); /* do not happen */ 1462 WARN_ON(err); /* do not happen */
1519 } 1463 }
1520 nilfs_clear_logs(&sci->sc_segbufs);
1521
1522 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1523 if (unlikely(err))
1524 return err;
1525
1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1464 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1527 sci->sc_stage = prev_stage; 1465 sci->sc_stage = prev_stage;
1528 } 1466 }
@@ -1567,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1567 ssp.offset = sizeof(struct nilfs_segment_summary); 1505 ssp.offset = sizeof(struct nilfs_segment_summary);
1568 1506
1569 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 1507 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1570 if (bh == sci->sc_super_root) 1508 if (bh == segbuf->sb_super_root)
1571 break; 1509 break;
1572 if (!finfo) { 1510 if (!finfo) {
1573 finfo = nilfs_segctor_map_segsum_entry( 1511 finfo = nilfs_segctor_map_segsum_entry(
@@ -1728,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1728 1666
1729 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1667 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1730 b_assoc_buffers) { 1668 b_assoc_buffers) {
1731 if (bh == sci->sc_super_root) { 1669 if (bh == segbuf->sb_super_root) {
1732 if (bh->b_page != bd_page) { 1670 if (bh->b_page != bd_page) {
1733 lock_page(bd_page); 1671 lock_page(bd_page);
1734 clear_page_dirty_for_io(bd_page); 1672 clear_page_dirty_for_io(bd_page);
@@ -1847,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1847} 1785}
1848 1786
1849static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, 1787static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1850 struct buffer_head *bh_sr, int err) 1788 int err)
1851{ 1789{
1852 struct nilfs_segment_buffer *segbuf; 1790 struct nilfs_segment_buffer *segbuf;
1853 struct page *bd_page = NULL, *fs_page = NULL; 1791 struct page *bd_page = NULL, *fs_page = NULL;
@@ -1868,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1868 1806
1869 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1807 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1870 b_assoc_buffers) { 1808 b_assoc_buffers) {
1871 if (bh == bh_sr) { 1809 if (bh == segbuf->sb_super_root) {
1872 if (bh->b_page != bd_page) { 1810 if (bh->b_page != bd_page) {
1873 end_page_writeback(bd_page); 1811 end_page_writeback(bd_page);
1874 bd_page = bh->b_page; 1812 bd_page = bh->b_page;
@@ -1897,8 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1897 1835
1898 list_splice_tail_init(&sci->sc_write_logs, &logs); 1836 list_splice_tail_init(&sci->sc_write_logs, &logs);
1899 ret = nilfs_wait_on_logs(&logs); 1837 ret = nilfs_wait_on_logs(&logs);
1900 if (ret) 1838 nilfs_abort_logs(&logs, NULL, ret ? : err);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
1902 1839
1903 list_splice_tail_init(&sci->sc_segbufs, &logs); 1840 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1841 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1914 } 1851 }
1915 1852
1916 nilfs_destroy_logs(&logs); 1853 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1918} 1854}
1919 1855
1920static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1856static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1933 struct nilfs_segment_buffer *segbuf; 1869 struct nilfs_segment_buffer *segbuf;
1934 struct page *bd_page = NULL, *fs_page = NULL; 1870 struct page *bd_page = NULL, *fs_page = NULL;
1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1871 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1936 int update_sr = (sci->sc_super_root != NULL); 1872 int update_sr = false;
1937 1873
1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1874 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1939 struct buffer_head *bh; 1875 struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1964 set_buffer_uptodate(bh); 1900 set_buffer_uptodate(bh);
1965 clear_buffer_dirty(bh); 1901 clear_buffer_dirty(bh);
1966 clear_buffer_nilfs_volatile(bh); 1902 clear_buffer_nilfs_volatile(bh);
1967 if (bh == sci->sc_super_root) { 1903 if (bh == segbuf->sb_super_root) {
1968 if (bh->b_page != bd_page) { 1904 if (bh->b_page != bd_page) {
1969 end_page_writeback(bd_page); 1905 end_page_writeback(bd_page);
1970 bd_page = bh->b_page; 1906 bd_page = bh->b_page;
1971 } 1907 }
1908 update_sr = true;
1972 break; 1909 break;
1973 } 1910 }
1974 if (bh->b_page != fs_page) { 1911 if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 struct nilfs_sb_info *sbi = sci->sc_sbi; 2052 struct nilfs_sb_info *sbi = sci->sc_sbi;
2116 struct the_nilfs *nilfs = sbi->s_nilfs; 2053 struct the_nilfs *nilfs = sbi->s_nilfs;
2117 struct page *failed_page; 2054 struct page *failed_page;
2118 int err, has_sr = 0; 2055 int err;
2119 2056
2120 sci->sc_stage.scnt = NILFS_ST_INIT; 2057 sci->sc_stage.scnt = NILFS_ST_INIT;
2121 2058
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2143 if (unlikely(err)) 2080 if (unlikely(err))
2144 goto failed; 2081 goto failed;
2145 2082
2146 has_sr = (sci->sc_super_root != NULL);
2147
2148 /* Avoid empty segment */ 2083 /* Avoid empty segment */
2149 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2084 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2159 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2160 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2161 2096
2162 if (has_sr) { 2097 if (mode == SC_LSEG_SR &&
2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
2163 err = nilfs_segctor_fill_in_checkpoint(sci); 2099 err = nilfs_segctor_fill_in_checkpoint(sci);
2164 if (unlikely(err)) 2100 if (unlikely(err))
2165 goto failed_to_write; 2101 goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2171 /* Write partial segments */ 2107 /* Write partial segments */
2172 err = nilfs_segctor_prepare_write(sci, &failed_page); 2108 err = nilfs_segctor_prepare_write(sci, &failed_page);
2173 if (err) { 2109 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page, 2110 nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
2175 sci->sc_super_root, err);
2176 goto failed_to_write; 2111 goto failed_to_write;
2177 } 2112 }
2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2113
2114 nilfs_add_checksums_on_logs(&sci->sc_segbufs,
2115 nilfs->ns_crc_seed);
2179 2116
2180 err = nilfs_segctor_write(sci, nilfs); 2117 err = nilfs_segctor_write(sci, nilfs);
2181 if (unlikely(err)) 2118 if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2196 } 2133 }
2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2134 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2198 2135
2199 sci->sc_super_root = NULL;
2200
2201 out: 2136 out:
2202 nilfs_segctor_check_out_files(sci, sbi); 2137 nilfs_segctor_check_out_files(sci, sbi);
2203 return err; 2138 return err;
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2224static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) 2159static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2225{ 2160{
2226 spin_lock(&sci->sc_state_lock); 2161 spin_lock(&sci->sc_state_lock);
2227 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { 2162 if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2228 sci->sc_timer->expires = jiffies + sci->sc_interval; 2163 sci->sc_timer.expires = jiffies + sci->sc_interval;
2229 add_timer(sci->sc_timer); 2164 add_timer(&sci->sc_timer);
2230 sci->sc_state |= NILFS_SEGCTOR_COMMIT; 2165 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2231 } 2166 }
2232 spin_unlock(&sci->sc_state_lock); 2167 spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2431 spin_lock(&sci->sc_state_lock); 2366 spin_lock(&sci->sc_state_lock);
2432 sci->sc_seq_accepted = sci->sc_seq_request; 2367 sci->sc_seq_accepted = sci->sc_seq_request;
2433 spin_unlock(&sci->sc_state_lock); 2368 spin_unlock(&sci->sc_state_lock);
2434 2369 del_timer_sync(&sci->sc_timer);
2435 if (sci->sc_timer)
2436 del_timer_sync(sci->sc_timer);
2437} 2370}
2438 2371
2439/** 2372/**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2392 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2460 2393
2461 /* re-enable timer if checkpoint creation was not done */ 2394 /* re-enable timer if checkpoint creation was not done */
2462 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2395 if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2463 time_before(jiffies, sci->sc_timer->expires)) 2396 time_before(jiffies, sci->sc_timer.expires))
2464 add_timer(sci->sc_timer); 2397 add_timer(&sci->sc_timer);
2465 } 2398 }
2466 spin_unlock(&sci->sc_state_lock); 2399 spin_unlock(&sci->sc_state_lock);
2467} 2400}
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2573{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2574 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2575 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2576 int timeout = 0;
2645 2577
2646 init_timer(&timer); 2578 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2579 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2580
2651 /* start sync. */ 2581 /* start sync. */
2652 sci->sc_task = current; 2582 sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2625 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2626 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2627 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2628 sci->sc_timer.expires);
2699 2629
2700 if (should_sleep) { 2630 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2631 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2634 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2635 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2636 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2637 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2638
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2639 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2640 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2643
2714 end_thread: 2644 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2645 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2646
2719 /* end sync. */ 2647 /* end sync. */
2720 sci->sc_task = NULL; 2648 sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2678 }
2751} 2679}
2752 2680
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2681/*
2761 * Setup & clean-up functions 2682 * Setup & clean-up functions
2762 */ 2683 */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2701 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2702 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2703 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2704 init_timer(&sci->sc_timer);
2783 2705
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2706 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2707 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2768
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2769 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2770
2771 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2772 kfree(sci);
2850} 2773}
2851 2774
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2803 return -ENOMEM;
2881 2804
2882 nilfs_attach_writer(nilfs, sbi); 2805 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2806 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2807 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2808 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2809 kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
100 * @sc_write_logs: List of segment buffers to hold logs under writing 100 * @sc_write_logs: List of segment buffers to hold logs under writing
101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
102 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
103 * @sc_super_root: Pointer to the super root buffer
104 * @sc_stage: Collection stage 103 * @sc_stage: Collection stage
105 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary 104 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
106 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary 105 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
148 struct list_head sc_write_logs; 147 struct list_head sc_write_logs;
149 unsigned long sc_segbuf_nblocks; 148 unsigned long sc_segbuf_nblocks;
150 struct nilfs_segment_buffer *sc_curseg; 149 struct nilfs_segment_buffer *sc_curseg;
151 struct buffer_head *sc_super_root;
152 150
153 struct nilfs_cstage sc_stage; 151 struct nilfs_cstage sc_stage;
154 152
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
179 unsigned long sc_lseg_stime; /* in 1/HZ seconds */ 177 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
180 unsigned long sc_watermark; 178 unsigned long sc_watermark;
181 179
182 struct timer_list *sc_timer; 180 struct timer_list sc_timer;
183 struct task_struct *sc_task; 181 struct task_struct *sc_task;
184}; 182};
185 183
@@ -219,6 +217,8 @@ enum {
219 */ 217 */
220#define NILFS_SC_DEFAULT_WATERMARK 3600 218#define NILFS_SC_DEFAULT_WATERMARK 3600
221 219
220/* super.c */
221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void); 224extern int nilfs_init_transaction_cache(void);
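
The segment.h change from "struct timer_list *sc_timer" to an embedded "struct timer_list sc_timer" matches the segment.c hunks above: the timer used to live on the segctor thread's stack, so every user had to NULL-check the pointer; embedding it in nilfs_sc_info ties its lifetime to the structure and lets nilfs_segctor_destroy() simply del_timer_sync() it. A generic sketch of the embedded-timer pattern with the timer API of this kernel era, names invented:

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/sched.h>

struct worker {
        struct timer_list timer;
        struct task_struct *task;       /* woken on timeout */
};

static void worker_timeout(unsigned long data)
{
        wake_up_process((struct task_struct *)data);
}

static void worker_setup(struct worker *w)
{
        init_timer(&w->timer);
        w->timer.data = (unsigned long)w->task;
        w->timer.function = worker_timeout;
}

static void worker_arm(struct worker *w, unsigned long delay)
{
        w->timer.expires = jiffies + delay;
        add_timer(&w->timer);
}
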
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0cdbc5e7655a..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
129 va_end(args); 134 va_end(args);
130} 135}
131 136
132static struct kmem_cache *nilfs_inode_cachep;
133 137
134struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 138struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
135{ 139{
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
155 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 159 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
156} 160}
157 161
158static void init_once(void *obj)
159{
160 struct nilfs_inode_info *ii = obj;
161
162 INIT_LIST_HEAD(&ii->i_dirty);
163#ifdef CONFIG_NILFS_XATTR
164 init_rwsem(&ii->xattr_sem);
165#endif
166 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
167 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
168 inode_init_once(&ii->vfs_inode);
169}
170
171static int nilfs_init_inode_cache(void)
172{
173 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
174 sizeof(struct nilfs_inode_info),
175 0, SLAB_RECLAIM_ACCOUNT,
176 init_once);
177
178 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
179}
180
181static inline void nilfs_destroy_inode_cache(void)
182{
183 kmem_cache_destroy(nilfs_inode_cachep);
184}
185
186static void nilfs_clear_inode(struct inode *inode) 162static void nilfs_clear_inode(struct inode *inode)
187{ 163{
188 struct nilfs_inode_info *ii = NILFS_I(inode); 164 struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
266 int err; 242 int err;
267 243
268 /* nilfs->sem must be locked by the caller. */ 244 /* nilfs->sem must be locked by the caller. */
269 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { 245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
270 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) 246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
271 nilfs_swap_super_block(nilfs); 247 nilfs_swap_super_block(nilfs);
272 else { 248 else {
273 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
470 if (nilfs_test_opt(sbi, SNAPSHOT)) 446 if (nilfs_test_opt(sbi, SNAPSHOT))
471 seq_printf(seq, ",cp=%llu", 447 seq_printf(seq, ",cp=%llu",
472 (unsigned long long int)sbi->s_snapshot_cno); 448 (unsigned long long int)sbi->s_snapshot_cno);
473 if (nilfs_test_opt(sbi, ERRORS_RO))
474 seq_printf(seq, ",errors=remount-ro");
475 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 449 if (nilfs_test_opt(sbi, ERRORS_PANIC))
476 seq_printf(seq, ",errors=panic"); 450 seq_printf(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue");
477 if (nilfs_test_opt(sbi, STRICT_ORDER)) 453 if (nilfs_test_opt(sbi, STRICT_ORDER))
478 seq_printf(seq, ",order=strict"); 454 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY)) 455 if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
631 struct nilfs_super_block *sbp) 607 struct nilfs_super_block *sbp)
632{ 608{
633 sbi->s_mount_opt = 609 sbi->s_mount_opt =
634 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; 610 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
635} 611}
636 612
637static int nilfs_setup_super(struct nilfs_sb_info *sbi) 613static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -749,6 +725,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
749 sb->s_export_op = &nilfs_export_ops; 725 sb->s_export_op = &nilfs_export_ops;
750 sb->s_root = NULL; 726 sb->s_root = NULL;
751 sb->s_time_gran = 1; 727 sb->s_time_gran = 1;
728 sb->s_bdi = nilfs->ns_bdi;
752 729
753 err = load_nilfs(nilfs, sbi); 730 err = load_nilfs(nilfs, sbi);
754 if (err) 731 if (err)
@@ -777,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
777 goto failed_sbi; 754 goto failed_sbi;
778 } 755 }
779 cno = sbi->s_snapshot_cno; 756 cno = sbi->s_snapshot_cno;
780 } else 757 }
781 /* Read-only mount */
782 sbi->s_snapshot_cno = cno;
783 } 758 }
784 759
785 err = nilfs_attach_checkpoint(sbi, cno); 760 err = nilfs_attach_checkpoint(sbi, cno);
@@ -848,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
848 struct the_nilfs *nilfs = sbi->s_nilfs; 823 struct the_nilfs *nilfs = sbi->s_nilfs;
849 unsigned long old_sb_flags; 824 unsigned long old_sb_flags;
850 struct nilfs_mount_options old_opts; 825 struct nilfs_mount_options old_opts;
851 int err; 826 int was_snapshot, err;
852 827
853 lock_kernel(); 828 lock_kernel();
854 829
@@ -856,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
856 old_sb_flags = sb->s_flags; 831 old_sb_flags = sb->s_flags;
857 old_opts.mount_opt = sbi->s_mount_opt; 832 old_opts.mount_opt = sbi->s_mount_opt;
858 old_opts.snapshot_cno = sbi->s_snapshot_cno; 833 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
859 835
860 if (!parse_options(data, sb)) { 836 if (!parse_options(data, sb)) {
861 err = -EINVAL; 837 err = -EINVAL;
@@ -863,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 } 839 }
864 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
865 841
866 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
867 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
868 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
869 "remount to a different snapshot.\n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
870 sb->s_id); 846 "snapshot read/write.\n",
871 err = -EINVAL; 847 sb->s_id);
872 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
873 } 862 }
874 863
875 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts; 868 goto restore_opts;
881 } 869 }
882 870
@@ -887,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
887 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
888 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
889 877
890 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
891 /* nilfs_set_opt(sbi, SNAPSHOT); */
892
893 /* 878 /*
894 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
895 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -908,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
908 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
909 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
910 */ 895 */
911 if (nilfs->ns_current && nilfs->ns_current != sbi) {
912 printk(KERN_WARNING "NILFS (device %s): couldn't "
913 "remount because an RW-mount exists.\n",
914 sb->s_id);
915 err = -EBUSY;
916 goto restore_opts;
917 }
918 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
919 printk(KERN_WARNING "NILFS (device %s): couldn't "
920 "remount because the current RO-mount is not "
921 "the latest one.\n",
922 sb->s_id);
923 err = -EINVAL;
924 goto restore_opts;
925 }
926 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
927 nilfs_clear_opt(sbi, SNAPSHOT);
928 sbi->s_snapshot_cno = 0;
929 897
930 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
931 if (err) 899 if (err)
@@ -934,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
934 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
935 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
936 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
937
938 nilfs->ns_current = sbi;
939 } 905 }
940 out: 906 out:
941 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1021,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1021{ 987{
1022 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1023 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1024 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1025 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1026 993
1027 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1028 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1029 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1030 1000
@@ -1091,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1091 1061
1092 /* New superblock instance created */ 1062 /* New superblock instance created */
1093 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1094 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1095 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1096 1067
1097 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1098 if (err) 1070 if (err)
1099 goto cancel_new; 1071 goto cancel_new;
1100 1072
@@ -1105,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1105 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1106 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1107 if (need_to_close) 1079 if (need_to_close)
1108 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1109 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1110 return 0; 1082 return 0;
1111 1083
@@ -1113,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1113 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1114 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1115 failed: 1087 failed:
1116 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1117 1089
1118 return err; 1090 return err;
1119 1091
@@ -1123,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1123 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1124 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1125 /* 1097 /*
1126 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1127 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1128 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1129 */ 1101 */
@@ -1138,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1138 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1139}; 1111};
1140 1112
1141static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1142{ 1114{
1143 int err; 1115 struct nilfs_inode_info *ii = obj;
1144
1145 err = nilfs_init_inode_cache();
1146 if (err)
1147 goto failed;
1148 1116
1149 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1150 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1151 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1152 1125
1153 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1154 if (err) 1127{
1155 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1156 1130
1157 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1158 if (err) 1132{
1159 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1160 1142
1161 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1162 if (err) 1144{
1163 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1164 1168
1165 return 0; 1169 return 0;
1166 1170
1167 failed_btree_path_cache: 1171fail:
1168 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1169 1179
1170 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1171 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1172 1183
1173 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1174 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1175 1187
1176 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1177 nilfs_destroy_inode_cache(); 1189 return 0;
1178 1190
1179 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1180 return err; 1194 return err;
1181} 1195}
1182 1196
1183static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1184{ 1198{
1185 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1186 nilfs_destroy_transaction_cache();
1187 nilfs_destroy_inode_cache();
1188 nilfs_btree_path_cache_destroy();
1189 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1190} 1201}
1191 1202
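
The super.c rework gathers all four slab caches behind a single nilfs_init_cachep()/nilfs_destroy_cachep() pair, so every allocation failure unwinds through one label instead of the old ladder of failed_* targets. The NULL checks in the destroy path are load-bearing in this era, since kmem_cache_destroy() did not yet tolerate a NULL cache. A trimmed sketch of the same shape, with made-up cache and struct names:

#include <linux/init.h>
#include <linux/slab.h>

struct a_obj { int x; };        /* illustrative payload types */
struct b_obj { long y; };

static struct kmem_cache *a_cachep;
static struct kmem_cache *b_cachep;

static void example_destroy_caches(void)
{
        if (a_cachep)
                kmem_cache_destroy(a_cachep);
        if (b_cachep)
                kmem_cache_destroy(b_cachep);
}

static int __init example_init_caches(void)
{
        a_cachep = kmem_cache_create("example_a", sizeof(struct a_obj),
                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (!a_cachep)
                goto fail;

        b_cachep = kmem_cache_create("example_b", sizeof(struct b_obj),
                                     0, 0, NULL);
        if (!b_cachep)
                goto fail;

        return 0;

fail:
        example_destroy_caches();
        return -ENOMEM;
}

The same file also tightens the remount rules (a snapshot mount can never go read/write, and a regular mount cannot become a snapshot) and opens the block device with an fmode_t derived from MS_RDONLY instead of reusing the raw mount flags.
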
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
490 * Compare two super blocks and set 1 in swp if the secondary
491 * super block is valid and newer. Otherwise, set 0 in swp.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
@@ -670,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 674 start * sects_per_block,
671 nblocks * sects_per_block, 675 nblocks * sects_per_block,
672 GFP_NOFS, 676 GFP_NOFS,
673 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 678 if (ret < 0)
675 return ret; 679 return ret;
676 nblocks = 0; 680 nblocks = 0;
@@ -680,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 685 start * sects_per_block,
682 nblocks * sects_per_block, 686 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 688 return ret;
685} 689}
686 690
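
The nilfs_load_super_block() change above alters which super block wins when both copies are valid: the comparison now uses s_last_cno (the last checkpoint number) rather than s_wtime. Checkpoint numbers increase monotonically with every checkpoint, whereas write timestamps can collide or even go backwards with the system clock, so the checkpoint number is presumably the more reliable recency signal. The decision reduces to a predicate like this sketch (field names as in the diff, helper name invented, kernel context assumed):

#include <linux/kernel.h>       /* le64_to_cpu() */

static int prefer_secondary(struct nilfs_super_block **sbp,
                            int valid0, int valid1)
{
        return valid1 && (!valid0 ||
                          le64_to_cpu(sbp[1]->s_last_cno) >
                          le64_to_cpu(sbp[0]->s_last_cno));
}

The blkdev_issue_discard() hunks in the same file just track the block layer's flag rename from DISCARD_FL_BARRIER to BLKDEV_IFL_BARRIER.
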
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 select ANON_INODES
18 select FSNOTIFY 19 select FSNOTIFY
19 default y 20 default y
20 ---help--- 21 ---help---
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
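
The pin_inotify_watch() rewrite above replaces the sb_lock/S_BIAS inspection with atomic_inc_not_zero() on s_active: the reference is taken if and only if the superblock's active count has not yet dropped to zero, that is, only while teardown has not begun. The same get-if-still-alive idiom, sketched with invented names:

#include <asm/atomic.h> /* atomic ops, per this kernel era */

struct pinned {
        atomic_t active;        /* hits 0 once teardown starts */
};

static void teardown(struct pinned *p);  /* assumed teardown hook */

/* Succeeds and holds a reference iff the object is still live. */
static int pin(struct pinned *p)
{
        return atomic_inc_not_zero(&p->active);
}

static void unpin(struct pinned *p)
{
        if (atomic_dec_and_test(&p->active))
                teardown(p);
}
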
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
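
[Review note on the inotify.c change above: the rewritten pin_to_kill() replaces the old S_BIAS/s_count dance with a single atomic_inc_not_zero() on ->s_active, so it either pins a live superblock or reports the watch dead; the three-way return, and with it the drop_super() variant in unpin_and_kill(), disappears. A minimal userspace sketch of the same "pin only if still live" pattern, using C11 atomics rather than the kernel's atomic_t; the names here are illustrative, not kernel API:]

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
        atomic_int active;    /* plays the role of sb->s_active */
    };

    /* Take a reference only if the count has not already hit zero, so a
     * dying object can never be revived; returns true iff pinned. */
    static bool try_pin(struct obj *o)
    {
        int cur = atomic_load(&o->active);
        while (cur != 0)
            if (atomic_compare_exchange_weak(&o->active, &cur, cur + 1))
                return true;    /* pinned: a matching unpin is required */
        return false;           /* already shutting down */
    }

    static void unpin(struct obj *o)
    {
        atomic_fetch_sub(&o->active, 1);  /* kernel side: deactivate_super() */
    }
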
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
28#include <linux/path.h> /* struct path */ 28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */ 29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/sched.h>
31 32
32#include "inotify.h" 33#include "inotify.h"
33 34
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
146 idr_for_each(&group->inotify_data.idr, idr_callback, group); 147 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr); 148 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr); 149 idr_destroy(&group->inotify_data.idr);
150 free_uid(group->inotify_data.user);
149} 151}
150 152
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 153void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef82..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err; 547 goto out_err;
548 548
549 /* we are putting the mark on the idr, take a reference */
550 fsnotify_get_mark(&tmp_ientry->fsn_entry);
551
549 spin_lock(&group->inotify_data.idr_lock); 552 spin_lock(&group->inotify_data.idr_lock);
550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
551 group->inotify_data.last_wd+1, 554 group->inotify_data.last_wd+1,
552 &tmp_ientry->wd); 555 &tmp_ientry->wd);
553 spin_unlock(&group->inotify_data.idr_lock); 556 spin_unlock(&group->inotify_data.idr_lock);
554 if (ret) { 557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
555 /* idr was out of memory; allocate and try again */ 561 /* idr was out of memory; allocate and try again */
556 if (ret == -EAGAIN) 562 if (ret == -EAGAIN)
557 goto retry; 563 goto retry;
558 goto out_err; 564 goto out_err;
559 } 565 }
560 566
561 /* we put the mark on the idr, take a reference */
562 fsnotify_get_mark(&tmp_ientry->fsn_entry);
563
564 /* we are on the idr, now get on the inode */ 567 /* we are on the idr, now get on the inode */
565 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
566 if (ret) { 569 if (ret) {
@@ -578,16 +581,13 @@ retry:
578 /* return the watch descriptor for this new entry */ 581 /* return the watch descriptor for this new entry */
579 ret = tmp_ientry->wd; 582 ret = tmp_ientry->wd;
580 583
581 /* match the ref from fsnotify_init_markentry() */
582 fsnotify_put_mark(&tmp_ientry->fsn_entry);
583
584 /* if this mark added a new event update the group mask */ 584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask) 585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group); 586 fsnotify_recalc_group_mask(group);
587 587
588out_err: 588out_err:
589 if (ret < 0) 589 /* match the ref from fsnotify_init_markentry() */
590 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 590 fsnotify_put_mark(&tmp_ientry->fsn_entry);
591 591
592 return ret; 592 return ret;
593} 593}
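
[Review note: the inotify_user.c hunks reorder reference counting around the idr. The mark's reference is now taken before idr_get_new_above() publishes it, dropped immediately if insertion fails, and the put matching fsnotify_init_markentry() moves to the common exit path so every return drops it exactly once. A hedged userspace analogue of "take the reference before publishing", with illustrative names rather than fsnotify API:]

    #include <stdatomic.h>
    #include <stddef.h>

    struct mark { atomic_int refs; };

    static _Atomic(struct mark *) slot;    /* stands in for the idr slot */

    /* Count the container's reference before the pointer becomes visible,
     * so a concurrent lookup can never see an unreferenced object. */
    static int publish(struct mark *m)
    {
        atomic_fetch_add(&m->refs, 1);          /* ref the slot will own */
        struct mark *expected = NULL;
        if (!atomic_compare_exchange_strong(&slot, &expected, m)) {
            atomic_fetch_sub(&m->refs, 1);      /* lost the race: drop it */
            return -1;
        }
        return 0;
    }
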
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
@@ -1526,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1526 * this problem for now. We do write the $BITMAP attribute if it is present, 1527 * this problem for now. We do write the $BITMAP attribute if it is present,
1527 * which is the important one for a directory, so things are not too bad. 1528 * which is the important one for a directory, so things are not too bad.
1528 */ 1529 */
1529static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1530 int datasync)
1531{ 1531{
1532 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1533 int err, ret; 1533 int err, ret;
1534 ntfs_attr na; 1534 ntfs_attr na;
1535 1535
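
[Review note: the ntfs_dir_fsync() hunk above is part of the tree-wide ->fsync prototype change, which recurs in fs/ntfs/file.c below: the dentry argument goes away and the inode is recovered from the file's address_space instead. A sketch of the converted shape, as a kernel-style fragment for illustration only; it will not build outside a kernel tree:]

    static int example_fsync(struct file *filp, int datasync)
    {
        struct inode *inode = filp->f_mapping->host;  /* was dentry->d_inode */

        /* ... write back dirty pages; skip pure metadata if datasync ... */
        return 0;
    }
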
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
@@ -97,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
97 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
99 * 100 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -109,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
109 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller. 108 * held by the caller.
111 */ 109 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
113 struct page **cached_page, struct pagevec *lru_pvec)
114{ 111{
115 s64 old_init_size; 112 s64 old_init_size;
116 loff_t old_i_size; 113 loff_t old_i_size;
@@ -402,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
402 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
403 * starting at index @index. 400 * starting at index @index.
404 * 401 *
405 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to the LRU list.
406 * caller's lru-buffering pagevec @lru_pvec.
407 *
408 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
409 * are obtained at once instead of just one page and that 0 is returned on
410 * success and -errno on error.
411 * 403 *
412 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
413 */ 405 */
414static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
415 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
416 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
417{ 409{
418 int err, nr; 410 int err, nr;
419 411
@@ -429,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
429 goto err_out; 421 goto err_out;
430 } 422 }
431 } 423 }
432 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
433 GFP_KERNEL); 425 GFP_KERNEL);
434 if (unlikely(err)) { 426 if (unlikely(err)) {
435 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -437,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
437 goto err_out; 429 goto err_out;
438 } 430 }
439 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
440 page_cache_get(*cached_page);
441 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
442 __pagevec_lru_add_file(lru_pvec);
443 *cached_page = NULL; 432 *cached_page = NULL;
444 } 433 }
445 index++; 434 index++;
@@ -1799,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1799 ssize_t status, written; 1788 ssize_t status, written;
1800 unsigned nr_pages; 1789 unsigned nr_pages;
1801 int err; 1790 int err;
1802 struct pagevec lru_pvec;
1803 1791
1804 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1805 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1911,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1911 } 1899 }
1912 } 1900 }
1913 } 1901 }
1914 pagevec_init(&lru_pvec, 0);
1915 written = 0; 1902 written = 0;
1916 /* 1903 /*
1917 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1924,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1924 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1925 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1926 if (pos > ll) { 1913 if (pos > ll) {
1927 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1928 &lru_pvec);
1929 if (err < 0) { 1915 if (err < 0) {
1930 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1931 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2011,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2011 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2012 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2013 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2014 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2015 if (unlikely(status)) 2001 if (unlikely(status))
2016 break; 2002 break;
2017 /* 2003 /*
@@ -2076,7 +2062,6 @@ err_out:
2076 *ppos = pos; 2062 *ppos = pos;
2077 if (cached_page) 2063 if (cached_page)
2078 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2079 pagevec_lru_add_file(&lru_pvec);
2080 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2081 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2082 (long)status); 2067 (long)status);
@@ -2148,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2148/** 2133/**
2149 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2150 * @filp: file to be synced 2135 * @filp: file to be synced
2151 * @dentry: dentry describing the file to sync
2152 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2153 * 2137 *
2154 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2164,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2164 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2165 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2166 * 2150 *
2167 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2168 * anyway.
2169 *
2170 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2171 * 2152 *
2172 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2173 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2174 * this problem for now. 2155 * this problem for now.
2175 */ 2156 */
2176static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2177 int datasync)
2178{ 2158{
2179 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2180 int err, ret = 0; 2160 int err, ret = 0;
2181 2161
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
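
[Review note: the fs/ntfs/file.c hunks drop the caller-maintained lru_pvec entirely. add_to_page_cache_lru() both inserts the page into the mapping and places it on the LRU, so the extra page_cache_get()/pagevec_add()/__pagevec_lru_add_file() bookkeeping, and the pagevec threaded through ntfs_attr_extend_initialized() and __ntfs_grab_cache_pages(), is no longer needed. The before/after shape, as a kernel-style fragment for illustration:]

    /* after: one call inserts into the mapping and onto the LRU */
    err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);

    /* before: insert, then batch the LRU addition by hand
     *   err = add_to_page_cache(page, mapping, index, GFP_KERNEL);
     *   page_cache_get(page);
     *   if (!pagevec_add(&lru_pvec, page))
     *       __pagevec_lru_add_file(&lru_pvec);
     */
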
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and on disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create its own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
@@ -421,7 +489,7 @@ cleanup:
421 return ret; 489 return ret;
422} 490}
423 491
424struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
425 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
426 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
427 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -429,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
429 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
430}; 498};
431 499
432struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
433 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
434 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
435 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
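
[Review note: the point of ocfs2_acl_set_mode() above is that a bare "inode->i_mode = mode" never reached the journal. The helper updates both the in-memory and on-disk i_mode under journal access, borrowing the caller's handle and buffer_head when available and creating its own otherwise. Both call shapes, condensed from the hunks above as a fragment rather than standalone code:]

    /* inside an existing transaction, with the dinode buffer in hand */
    ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);

    /* standalone: the helper reads the inode block and commits itself */
    ret = ocfs2_acl_set_mode(inode, NULL, NULL, mode);
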
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
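
[Review note: several hunks in this file stop folding handle->h_buffer_credits into the argument. That reflects a change in ocfs2_extend_trans() semantics: it now takes the number of additional credits wanted rather than a new absolute total, which also lets the old double-extend fallback go. The calling convention before and after, for illustration:]

    /* before: pass an absolute credit count */
    ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + extra);

    /* after: ask only for the extra credits needed */
    ret = ocfs2_extend_trans(handle, extra);
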
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() looks basically the
5593 * same as ocfs2_lock_allocators(), except that it accepts a number
5594 * of extra blocks to reserve, and it only handles metadata
5595 * allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
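
[Review note: the dealloc cache entries grow a free_bg field recording the suballocator group block at the time the block was cached. Switching the allocations to kzalloc() leaves free_bg zero for callers that do not supply it, and the free path prefers the recorded group over recomputing it, as in ocfs2_free_cached_blocks() above:]

    if (head->free_bg)
        bg_blkno = head->free_bg;
    else
        bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
                                              head->free_bit);
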
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update its h_next_leaf_blk field, as well
6583 * as the dinode's i_last_eb_blk. */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check if we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
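
The byte-order helpers this loop relies on (le16_add_cpu(), le32_add_cpu()) do an in-place read-modify-write while keeping the stored value little-endian; their generic kernel definition is essentially the following sketch:

static inline void le16_add_cpu_sketch(__le16 *var, u16 val)
{
	/* decode, add in CPU order, re-encode */
	*var = cpu_to_le16(le16_to_cpu(*var) + val);
}

Note the code above passes a negative amount (-clusters_to_del), which works because the unsigned addition wraps, giving correct two's-complement subtraction.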
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * it. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. It should also be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
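
The visible effect of the rewrite is at the call site: transaction start/commit, truncate-log locking, refcount-tree locking, and the ocfs2_truncate_context are all gone from the interface, folded behind ocfs2_remove_btree_range(). A hypothetical caller against the new prototype (example_truncate is illustrative, not part of this patch):

/* Sketch only; assumes the caller has already updated i_size and holds
 * the usual inode locks, as the existing callers do. */
static int example_truncate(struct ocfs2_super *osb, struct inode *inode,
			    struct buffer_head *di_bh)
{
	int status;

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0)
		mlog_errno(status);
	return status;
}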
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
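
The two new parameters let the remove path handle shared extents: callers pass the record's extent flags and the owning dinode's refcount tree location, while trees that can never be refcounted simply pass zero for both. The two call shapes, extracted from this patch:

/* Directory trees are never refcounted: flags = 0, refcount_loc = 0
 * (see the ocfs2_dx_dir_truncate() hunk below). */
ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
			       &dealloc, 0);

/* File data may be refcounted: pass the record flags and
 * di->i_refcount_loc (see ocfs2_commit_truncate() above). */
ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos,
			       trunc_len, flags, &dealloc, refcount_loc);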
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1735 goto out;
1736 } 1736 }
1737 1737
1738 if (data_ac)
1739 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1740
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1741 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1742 &di->id2.i_list,
1740 clusters_to_alloc); 1743 clusters_to_alloc);
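
This hunk and the inline-data conversion earlier install the same two-line hookup: point the allocation context's ac_resv at the inode's local-alloc data reservation before claiming clusters, so repeated writes to one inode tend to draw from one reserved window. The recurring pattern:

/* Pattern as it appears at the write_begin and convert-to-extents
 * sites in this patch. */
if (data_ac)
	data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;

ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num);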
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
407 struct buffer_head *bh) 406 struct buffer_head *bh)
408{ 407{
409 int ret = 0; 408 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
410 410
411 mlog_entry_void(); 411 mlog_entry_void();
412 412
@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
426 426
427 get_bh(bh); /* for end_buffer_write_sync() */ 427 get_bh(bh); /* for end_buffer_write_sync() */
428 bh->b_end_io = end_buffer_write_sync; 428 bh->b_end_io = end_buffer_write_sync;
429 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
429 submit_bh(WRITE, bh); 430 submit_bh(WRITE, bh);
430 431
431 wait_on_buffer(bh); 432 wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
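
The new bit plugs into the existing masklog machinery, so reservation tracing is off by default and can be toggled at runtime through the mask attribute added above. Hypothetical usage from reservation code elsewhere in this series (the message text and the start/len variables are illustrative):

mlog(ML_RESERVATIONS, "resv: start %u len %u for inode %llu\n",
     start, len, (unsigned long long)OCFS2_I(inode)->ip_blkno);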
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
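
This is the first of many call sites in this file converted on the assumption, from earlier in this series, that ocfs2_journal_dirty() now returns void and handles failure internally, so the error branch at each caller is dead code. The repeated before/after shape:

/* before: */
status = ocfs2_journal_dirty(handle, bh);
if (status < 0) {
	mlog_errno(status);
	goto bail;
}

/* after: */
ocfs2_journal_dirty(handle, bh);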
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4466
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4467 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4468 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4469 if (dx_root->dr_suballoc_loc)
4470 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4471 else
4472 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4473 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4474 bit, bg_blkno, 1);
4496 if (ret) 4475 if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4530
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4531 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4532
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4533 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4534 &dealloc, 0);
4556 if (ret) { 4535 if (ret) {
4557 mlog_errno(ret); 4536 mlog_errno(ret);
4558 goto out; 4537 goto out;
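
dr_suballoc_loc follows the same pattern as the h_suballoc_loc field added to extent blocks at the top of this diff: newly allocated structures record which suballocator block group they came from (plumbing for discontiguous block groups), and the free path prefers that record over deriving the group from the block number. Extracted:

if (dx_root->dr_suballoc_loc)
	/* new-style: group recorded at allocation time */
	bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
else
	/* older filesystems leave the field zero */
	bg_blkno = ocfs2_which_suballoc_group(blk, bit);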
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -89,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
89 return 0; 88 return 0;
90} 89}
91 90
92static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
93{ 92{
94 mlog_entry_void(); 93 mlog_entry_void();
95 94
@@ -146,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
146} 145}
147 146
148 147
149static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
150{ 149{
151 mlog_entry_void(); 150 mlog_entry_void();
152 151
@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 BUG_ON(!lksb); 184 BUG_ON(!lksb);
186 185
187 /* only updates if this node masters the lockres */ 186 /* only updates if this node masters the lockres */
187 spin_lock(&res->spinlock);
188 if (res->owner == dlm->node_num) { 188 if (res->owner == dlm->node_num) {
189
190 spin_lock(&res->spinlock);
191 /* check the lksb flags for the direction */ 189 /* check the lksb flags for the direction */
192 if (lksb->flags & DLM_LKSB_GET_LVB) { 190 if (lksb->flags & DLM_LKSB_GET_LVB) {
193 mlog(0, "getting lvb from lockres for %s node\n", 191 mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
202 * here. In the future we might want to clear it at the time 200 * here. In the future we might want to clear it at the time
203 * the put is actually done. 201 * the put is actually done.
204 */ 202 */
205 spin_unlock(&res->spinlock);
206 } 203 }
204 spin_unlock(&res->spinlock);
207 205
208 /* reset any lvb flags on the lksb */ 206 /* reset any lvb flags on the lksb */
209 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); 207 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -453,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
453 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
454 lock->ml.node, &status); 452 lock->ml.node, &status);
455 if (ret < 0) 453 if (ret < 0)
456 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
457 else { 457 else {
458 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
459 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
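
This is the template for the many logging hunks in the dlm files below: a bare mlog_errno() after a failed o2net send becomes a message identifying the errno, the dlm message type, the domain key, and the target node. The generic shape (msg_type, msg, and node stand in for the per-site values):

ret = o2net_send_message(msg_type, dlm->key, &msg, sizeof(msg),
			 node, &status);
if (ret < 0)
	mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
	     "node %u\n", ret, msg_type, dlm->key, node);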
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -391,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
391 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
392 dlm_error(ret); 391 dlm_error(ret);
393 } else { 392 } else {
394 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
395 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
396 /* instead of logging the same network error over 397 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 906 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 907 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 908
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 909 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 910 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 911 __dlm_print_nodes(dlm);
910 912
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 964 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 965 NULL);
964 if (status < 0) { 966 if (status < 0) {
965 mlog_errno(status); 967 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
968 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
969 node);
966 goto bail; 970 goto bail;
967 } 971 }
968 972
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1033 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1034
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1035 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1036 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1037 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1038 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1039 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1040 node);
1036 goto bail; 1041 goto bail;
1037 } 1042 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1043 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1108 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1109 NULL);
1105 if (status < 0) 1110 if (status < 0)
1106 mlog_errno(status); 1111 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1112 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1113 node);
1107 1114
1108 return status; 1115 return status;
1109} 1116}
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1523 goto leave;
1517 } 1524 }
1518 1525
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1526 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1527 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1528 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1529 kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1557 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1558 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1559
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1560 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1561 dlm->node_num = o2nm_this_node();
1556 1562
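
The kstrdup() conversion above folds a split allocate-then-copy into one call; equivalent sketch of the before and after:

/* before: two steps, with the strcpy() far from the allocation */
dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
...
strcpy(dlm->name, domain);

/* after: kstrdup() allocates and copies in one call */
dlm->name = kstrdup(domain, GFP_KERNEL);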
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 617{
618 struct dlm_lock_resource *res = NULL; 618 struct dlm_lock_resource *res = NULL;
619 619
620 res = (struct dlm_lock_resource *) 620 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 621 if (!res)
623 goto error; 622 goto error;
624 623
625 res->lockname.name = (char *) 624 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 625 if (!res->lockname.name)
628 goto error; 626 goto error;
629 627
@@ -757,8 +755,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 755 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 756 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 757 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 758 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 759 if (!alloc_mle)
763 goto leave; 760 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 761 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1539 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1540 spin_unlock(&dlm->spinlock);
1544 1541
1545 mle = (struct dlm_master_list_entry *) 1542 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1543 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1544 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1545 mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1662 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1663 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1664 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1665 mlog(ML_ERROR, "Error %d when sending message %u (key "
1666 "0x%x) to node %u\n", tmpret,
1667 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1668 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1669 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1670 BUG();
@@ -1875,7 +1873,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1873ok:
1876 spin_unlock(&res->spinlock); 1874 spin_unlock(&res->spinlock);
1877 } 1875 }
1878 spin_unlock(&dlm->spinlock);
1879 1876
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1877 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1878 // assert->node_idx);
@@ -1926,7 +1923,6 @@ ok:
1926 /* master is known, detach if not already detached. 1923 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1924 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1925 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1926 spin_lock(&dlm->master_lock);
1931 1927
1932 rr = atomic_read(&mle->mle_refs.refcount); 1928 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1955,6 @@ ok:
1959 __dlm_put_mle(mle); 1955 __dlm_put_mle(mle);
1960 } 1956 }
1961 spin_unlock(&dlm->master_lock); 1957 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1958 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1959 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1960 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1962,7 @@ ok:
1967 res->owner, namelen, name); 1962 res->owner, namelen, name);
1968 } 1963 }
1969 } 1964 }
1965 spin_unlock(&dlm->spinlock);
1970 1966
1971done: 1967done:
1972 ret = 0; 1968 ret = 0;
@@ -2207,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2207 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 &deref, sizeof(deref), res->owner, &r); 2204 &deref, sizeof(deref), res->owner, &r);
2209 if (ret < 0) 2205 if (ret < 0)
2210 mlog_errno(ret); 2206 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2207 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 res->owner);
2211 else if (r < 0) { 2209 else if (r < 0) {
2212 /* BAD. other node says I did not have a ref. */ 2210 /* BAD. other node says I did not have a ref. */
2213 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2454,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2454 goto leave; 2452 goto leave;
2455 } 2453 }
2456 2454
2457 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2455 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2458 GFP_NOFS);
2459 if (!mle) { 2456 if (!mle) {
2460 mlog_errno(ret); 2457 mlog_errno(ret);
2461 goto leave; 2458 goto leave;
@@ -2977,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2977 &migrate, sizeof(migrate), nodenum, 2974 &migrate, sizeof(migrate), nodenum,
2978 &status); 2975 &status);
2979 if (ret < 0) { 2976 if (ret < 0) {
2980 mlog(0, "migrate_request returned %d!\n", ret); 2977 mlog(ML_ERROR, "Error %d when sending message %u (key "
2978 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2979 dlm->key, nodenum);
2981 if (!dlm_is_host_down(ret)) { 2980 if (!dlm_is_host_down(ret)) {
2982 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2981 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2983 BUG(); 2982 BUG();
@@ -3035,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3035 hash = dlm_lockid_hash(name, namelen); 3034 hash = dlm_lockid_hash(name, namelen);
3036 3035
3037 /* preallocate.. if this fails, abort */ 3036 /* preallocate.. if this fails, abort */
3038 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3037 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3039 GFP_NOFS);
3040 3038
3041 if (!mle) { 3039 if (!mle) {
3042 ret = -ENOMEM; 3040 ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2646 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2647 /* node is down. not involved in recovery
2642 * so just keep going */ 2648 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2649 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2650 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2651 ret = 0;
2646 } 2652 }
@@ -2660,11 +2666,12 @@ retry:
2660 } 2666 }
2661 if (ret < 0) { 2667 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2668 struct dlm_lock_resource *res;
2669
2663 /* this is now a serious problem, possibly ENOMEM 2670 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2671 * in the network stack. must retry */
2665 mlog_errno(ret); 2672 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2673 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2674 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2675 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2676 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2677 if (res) {
@@ -2789,7 +2796,9 @@ stage2:
2789 if (ret >= 0) 2796 if (ret >= 0)
2790 ret = status; 2797 ret = status;
2791 if (ret < 0) { 2798 if (ret < 0) {
2792 mlog_errno(ret); 2799 mlog(ML_ERROR, "Error %d when sending message %u (key "
2800 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2801 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2802 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2803 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2804 * session, so set the status to zero to
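
Several hunks in this file keep the dlm_is_host_down(ret) check after the
new, noisier logging. A sketch of what that predicate does, assuming the
usual dlmcommon.h shape (the exact errno list there may be longer):

        /* sketch: classify a transport errno as "peer is gone", in which
         * case recovery will pick the node up and the sender continues */
        static inline int dlm_is_host_down(int errno)
        {
                switch (errno) {
                case -EHOSTDOWN:
                case -EHOSTUNREACH:
                case -ETIMEDOUT:
                case -ECONNRESET:
                case -ENOTCONN:
                        return 1;
                }
                return 0;
        }

Callers treat a down host as non-fatal (set ret to 0 and move on) and
BUG() on anything else, as the dlm_do_migrate_request hunk above shows.
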
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -310,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
310 * spinlock, and because we know that it is not migrating/ 309 * spinlock, and because we know that it is not migrating/
311 * recovering/in-progress, it is fine to reserve asts and 310 * recovering/in-progress, it is fine to reserve asts and
312 * basts right before queueing them all throughout */ 311 * basts right before queueing them all throughout */
312 assert_spin_locked(&dlm->ast_lock);
313 assert_spin_locked(&res->spinlock); 313 assert_spin_locked(&res->spinlock);
314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
315 DLM_LOCK_RES_RECOVERING| 315 DLM_LOCK_RES_RECOVERING|
@@ -338,7 +338,7 @@ converting:
338 /* queue the BAST if not already */ 338 /* queue the BAST if not already */
339 if (lock->ml.highest_blocked == LKM_IVMODE) { 339 if (lock->ml.highest_blocked == LKM_IVMODE) {
340 __dlm_lockres_reserve_ast(res); 340 __dlm_lockres_reserve_ast(res);
341 dlm_queue_bast(dlm, lock); 341 __dlm_queue_bast(dlm, lock);
342 } 342 }
343 /* update the highest_blocked if needed */ 343 /* update the highest_blocked if needed */
344 if (lock->ml.highest_blocked < target->ml.convert_type) 344 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -356,7 +356,7 @@ converting:
356 can_grant = 0; 356 can_grant = 0;
357 if (lock->ml.highest_blocked == LKM_IVMODE) { 357 if (lock->ml.highest_blocked == LKM_IVMODE) {
358 __dlm_lockres_reserve_ast(res); 358 __dlm_lockres_reserve_ast(res);
359 dlm_queue_bast(dlm, lock); 359 __dlm_queue_bast(dlm, lock);
360 } 360 }
361 if (lock->ml.highest_blocked < target->ml.convert_type) 361 if (lock->ml.highest_blocked < target->ml.convert_type)
362 lock->ml.highest_blocked = 362 lock->ml.highest_blocked =
@@ -384,7 +384,7 @@ converting:
384 spin_unlock(&target->spinlock); 384 spin_unlock(&target->spinlock);
385 385
386 __dlm_lockres_reserve_ast(res); 386 __dlm_lockres_reserve_ast(res);
387 dlm_queue_ast(dlm, target); 387 __dlm_queue_ast(dlm, target);
388 /* go back and check for more */ 388 /* go back and check for more */
389 goto converting; 389 goto converting;
390 } 390 }
@@ -403,7 +403,7 @@ blocked:
403 can_grant = 0; 403 can_grant = 0;
404 if (lock->ml.highest_blocked == LKM_IVMODE) { 404 if (lock->ml.highest_blocked == LKM_IVMODE) {
405 __dlm_lockres_reserve_ast(res); 405 __dlm_lockres_reserve_ast(res);
406 dlm_queue_bast(dlm, lock); 406 __dlm_queue_bast(dlm, lock);
407 } 407 }
408 if (lock->ml.highest_blocked < target->ml.type) 408 if (lock->ml.highest_blocked < target->ml.type)
409 lock->ml.highest_blocked = target->ml.type; 409 lock->ml.highest_blocked = target->ml.type;
@@ -419,7 +419,7 @@ blocked:
419 can_grant = 0; 419 can_grant = 0;
420 if (lock->ml.highest_blocked == LKM_IVMODE) { 420 if (lock->ml.highest_blocked == LKM_IVMODE) {
421 __dlm_lockres_reserve_ast(res); 421 __dlm_lockres_reserve_ast(res);
422 dlm_queue_bast(dlm, lock); 422 __dlm_queue_bast(dlm, lock);
423 } 423 }
424 if (lock->ml.highest_blocked < target->ml.type) 424 if (lock->ml.highest_blocked < target->ml.type)
425 lock->ml.highest_blocked = target->ml.type; 425 lock->ml.highest_blocked = target->ml.type;
@@ -445,7 +445,7 @@ blocked:
445 spin_unlock(&target->spinlock); 445 spin_unlock(&target->spinlock);
446 446
447 __dlm_lockres_reserve_ast(res); 447 __dlm_lockres_reserve_ast(res);
448 dlm_queue_ast(dlm, target); 448 __dlm_queue_ast(dlm, target);
449 /* go back and check for more */ 449 /* go back and check for more */
450 goto converting; 450 goto converting;
451 } 451 }
@@ -675,6 +675,7 @@ static int dlm_thread(void *data)
675 /* lockres can be re-dirtied/re-added to the 675 /* lockres can be re-dirtied/re-added to the
676 * dirty_list in this gap, but that is ok */ 676 * dirty_list in this gap, but that is ok */
677 677
678 spin_lock(&dlm->ast_lock);
678 spin_lock(&res->spinlock); 679 spin_lock(&res->spinlock);
679 if (res->owner != dlm->node_num) { 680 if (res->owner != dlm->node_num) {
680 __dlm_print_one_lock_resource(res); 681 __dlm_print_one_lock_resource(res);
@@ -695,6 +696,7 @@ static int dlm_thread(void *data)
695 /* move it to the tail and keep going */ 696 /* move it to the tail and keep going */
696 res->state &= ~DLM_LOCK_RES_DIRTY; 697 res->state &= ~DLM_LOCK_RES_DIRTY;
697 spin_unlock(&res->spinlock); 698 spin_unlock(&res->spinlock);
699 spin_unlock(&dlm->ast_lock);
698 mlog(0, "delaying list shuffling for in-" 700 mlog(0, "delaying list shuffling for in-"
699 "progress lockres %.*s, state=%d\n", 701 "progress lockres %.*s, state=%d\n",
700 res->lockname.len, res->lockname.name, 702 res->lockname.len, res->lockname.name,
@@ -716,6 +718,7 @@ static int dlm_thread(void *data)
716 dlm_shuffle_lists(dlm, res); 718 dlm_shuffle_lists(dlm, res);
717 res->state &= ~DLM_LOCK_RES_DIRTY; 719 res->state &= ~DLM_LOCK_RES_DIRTY;
718 spin_unlock(&res->spinlock); 720 spin_unlock(&res->spinlock);
721 spin_unlock(&dlm->ast_lock);
719 722
720 dlm_lockres_calc_usage(dlm, res); 723 dlm_lockres_calc_usage(dlm, res);
721 724
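
The dlmthread.c changes above establish one consistent lock order:
dlm->ast_lock is taken before res->spinlock, and dlm_shuffle_lists(),
which runs with res->spinlock held, now calls the
__dlm_queue_ast/__dlm_queue_bast variants rather than versions that would
take ast_lock themselves. A sketch of the locked/unlocked pairing this
assumes; the real prototypes live in the dlm headers:

        /* caller must hold dlm->ast_lock */
        void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        {
                assert_spin_locked(&dlm->ast_lock);
                /* ... move 'lock' onto dlm->pending_asts ... */
        }

        /* convenience wrapper for callers not yet holding the lock */
        void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        {
                spin_lock(&dlm->ast_lock);
                __dlm_queue_ast(dlm, lock);
                spin_unlock(&dlm->ast_lock);
        }
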
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -355,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
355 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
356 ret = status; 355 ret = status;
357 } else { 356 } else {
358 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
359 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
360 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
361 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08c..b83d6107a1f5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
112 * O_RDONLY -> PRMODE level 112 * O_RDONLY -> PRMODE level
113 * O_WRONLY -> EXMODE level 113 * O_WRONLY -> EXMODE level
114 * 114 *
115 * O_NONBLOCK -> LKM_NOQUEUE 115 * O_NONBLOCK -> NOQUEUE
116 */ 116 */
117static int dlmfs_decode_open_flags(int open_flags, 117static int dlmfs_decode_open_flags(int open_flags,
118 int *level, 118 int *level,
119 int *flags) 119 int *flags)
120{ 120{
121 if (open_flags & (O_WRONLY|O_RDWR)) 121 if (open_flags & (O_WRONLY|O_RDWR))
122 *level = LKM_EXMODE; 122 *level = DLM_LOCK_EX;
123 else 123 else
124 *level = LKM_PRMODE; 124 *level = DLM_LOCK_PR;
125 125
126 *flags = 0; 126 *flags = 0;
127 if (open_flags & O_NONBLOCK) 127 if (open_flags & O_NONBLOCK)
128 *flags |= LKM_NOQUEUE; 128 *flags |= DLM_LKF_NOQUEUE;
129 129
130 return 0; 130 return 0;
131} 131}
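
With dlmfs on the stackglue constants, the decode helper maps POSIX open
semantics onto cluster lock levels. A hypothetical caller, to make the
mapping concrete:

        int level, flags;

        dlmfs_decode_open_flags(file->f_flags, &level, &flags);
        /* open(path, O_RDWR | O_NONBLOCK) lands here with
         * level == DLM_LOCK_EX and flags == DLM_LKF_NOQUEUE;
         * a plain O_RDONLY open yields DLM_LOCK_PR and no flags. */
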
@@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
166 * to allow userspace to distinguish a 166 * to allow userspace to distinguish a
167 * valid lock request from one that simply couldn't be 167 * valid lock request from one that simply couldn't be
168 * granted. */ 168 * granted. */
169 if (flags & LKM_NOQUEUE && status == -EAGAIN) 169 if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
170 status = -ETXTBSY; 170 status = -ETXTBSY;
171 kfree(fp); 171 kfree(fp);
172 goto bail; 172 goto bail;
@@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
193 status = 0; 193 status = 0;
194 if (fp) { 194 if (fp) {
195 level = fp->fp_lock_level; 195 level = fp->fp_lock_level;
196 if (level != LKM_IVMODE) 196 if (level != DLM_LOCK_IV)
197 user_dlm_cluster_unlock(&ip->ip_lockres, level); 197 user_dlm_cluster_unlock(&ip->ip_lockres, level);
198 198
199 kfree(fp); 199 kfree(fp);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
262 if ((count + *ppos) > i_size_read(inode)) 262 if ((count + *ppos) > i_size_read(inode))
263 readlen = i_size_read(inode) - *ppos; 263 readlen = i_size_read(inode) - *ppos;
264 else 264 else
265 readlen = count - *ppos; 265 readlen = count;
266 266
267 lvb_buf = kmalloc(readlen, GFP_NOFS); 267 lvb_buf = kmalloc(readlen, GFP_NOFS);
268 if (!lvb_buf) 268 if (!lvb_buf)
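
The readlen change above fixes an off-by-*ppos bug in the short-read
path: with i_size 4096, *ppos 100 and count 200, this branch is taken
because count + *ppos (300) does not exceed i_size, yet the old code
computed readlen = count - *ppos = 100 and silently returned half the
requested bytes. The new code reads the full count; only the other
branch, which clamps against end of file, needs to involve *ppos.
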
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf9..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341e..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -278,10 +277,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 277 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 280 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 281
286out_commit: 282out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +426,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 426 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 427 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 428
433 status = ocfs2_journal_dirty(handle, fe_bh); 429 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 430
437out_commit: 431out_commit:
438 ocfs2_commit_trans(osb, handle); 432 ocfs2_commit_trans(osb, handle);
@@ -449,7 +443,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 443 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 444 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 445 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 446
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 447 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 448 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +481,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 481
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 482 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 483
484 ocfs2_resv_discard(&osb->osb_la_resmap,
485 &OCFS2_I(inode)->ip_la_data_resv);
486
491 /* 487 /*
492 * The inode lock forced other nodes to sync and drop their 488 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 489 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +513,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 513 goto bail_unlock_sem;
518 } 514 }
519 515
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 516 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 517 if (status < 0) {
528 mlog_errno(status); 518 mlog_errno(status);
529 goto bail_unlock_sem; 519 goto bail_unlock_sem;
@@ -666,11 +656,7 @@ restarted_transaction:
666 goto leave; 656 goto leave;
667 } 657 }
668 658
669 status = ocfs2_journal_dirty(handle, bh); 659 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 660
675 spin_lock(&OCFS2_I(inode)->ip_lock); 661 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 662 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -684,6 +670,7 @@ restarted_transaction:
684 if (why == RESTART_META) { 670 if (why == RESTART_META) {
685 mlog(0, "restarting function.\n"); 671 mlog(0, "restarting function.\n");
686 restart_func = 1; 672 restart_func = 1;
673 status = 0;
687 } else { 674 } else {
688 BUG_ON(why != RESTART_TRANS); 675 BUG_ON(why != RESTART_TRANS);
689 676
@@ -945,9 +932,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
945 struct ocfs2_super *osb = OCFS2_SB(sb); 932 struct ocfs2_super *osb = OCFS2_SB(sb);
946 struct buffer_head *bh = NULL; 933 struct buffer_head *bh = NULL;
947 handle_t *handle = NULL; 934 handle_t *handle = NULL;
948 int qtype;
949 struct dquot *transfer_from[MAXQUOTAS] = { };
950 struct dquot *transfer_to[MAXQUOTAS] = { }; 935 struct dquot *transfer_to[MAXQUOTAS] = { };
936 int qtype;
951 937
952 mlog_entry("(0x%p, '%.*s')\n", dentry, 938 mlog_entry("(0x%p, '%.*s')\n", dentry,
953 dentry->d_name.len, dentry->d_name.name); 939 dentry->d_name.len, dentry->d_name.name);
@@ -978,10 +964,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 if (status) 964 if (status)
979 return status; 965 return status;
980 966
967 if (is_quota_modification(inode, attr))
968 dquot_initialize(inode);
981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 969 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
982 if (size_change) { 970 if (size_change) {
983 dquot_initialize(inode);
984
985 status = ocfs2_rw_lock(inode, 1); 971 status = ocfs2_rw_lock(inode, 1);
986 if (status < 0) { 972 if (status < 0) {
987 mlog_errno(status); 973 mlog_errno(status);
@@ -1031,9 +1017,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1031 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1017 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1032 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1018 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1033 USRQUOTA); 1019 USRQUOTA);
1034 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1020 if (!transfer_to[USRQUOTA]) {
1035 USRQUOTA);
1036 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1037 status = -ESRCH; 1021 status = -ESRCH;
1038 goto bail_unlock; 1022 goto bail_unlock;
1039 } 1023 }
@@ -1043,9 +1027,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1043 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1027 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1044 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1028 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1045 GRPQUOTA); 1029 GRPQUOTA);
1046 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1030 if (!transfer_to[GRPQUOTA]) {
1047 GRPQUOTA);
1048 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1049 status = -ESRCH; 1031 status = -ESRCH;
1050 goto bail_unlock; 1032 goto bail_unlock;
1051 } 1033 }
@@ -1057,7 +1039,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1057 mlog_errno(status); 1039 mlog_errno(status);
1058 goto bail_unlock; 1040 goto bail_unlock;
1059 } 1041 }
1060 status = dquot_transfer(inode, attr); 1042 status = __dquot_transfer(inode, transfer_to);
1061 if (status < 0) 1043 if (status < 0)
1062 goto bail_commit; 1044 goto bail_commit;
1063 } else { 1045 } else {
@@ -1070,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1070 } 1052 }
1071 1053
1072 /* 1054 /*
1073 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1074 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1075 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1076 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -1097,10 +1079,8 @@ bail:
1097 brelse(bh); 1079 brelse(bh);
1098 1080
1099 /* Release quota pointers in case we acquired them */ 1081 /* Release quota pointers in case we acquired them */
1100 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1082 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1101 dqput(transfer_to[qtype]); 1083 dqput(transfer_to[qtype]);
1102 dqput(transfer_from[qtype]);
1103 }
1104 1084
1105 if (!status && attr->ia_valid & ATTR_MODE) { 1085 if (!status && attr->ia_valid & ATTR_MODE) {
1106 status = ocfs2_acl_chmod(inode); 1086 status = ocfs2_acl_chmod(inode);
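
The setattr hunks swap dquot_transfer(), which resolves the destination
dquots from the iattr by itself, for __dquot_transfer(), which takes
dquots the caller already pinned; that lets ocfs2 perform the dqget()
lookups under its own cluster locks and drop the transfer_from[] array
entirely (the quota code derives the source from the inode). The
resulting call pattern, as a sketch:

        struct dquot *transfer_to[MAXQUOTAS] = { };

        transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, USRQUOTA);
        if (!transfer_to[USRQUOTA])
                return -ESRCH;

        status = __dquot_transfer(inode, transfer_to);

        for (qtype = 0; qtype < MAXQUOTAS; qtype++)
                dqput(transfer_to[qtype]);      /* dqput(NULL) is a no-op */
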
@@ -1194,9 +1174,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1194 di = (struct ocfs2_dinode *) bh->b_data; 1174 di = (struct ocfs2_dinode *) bh->b_data;
1195 di->i_mode = cpu_to_le16(inode->i_mode); 1175 di->i_mode = cpu_to_le16(inode->i_mode);
1196 1176
1197 ret = ocfs2_journal_dirty(handle, bh); 1177 ocfs2_journal_dirty(handle, bh);
1198 if (ret < 0)
1199 mlog_errno(ret);
1200 1178
1201out_trans: 1179out_trans:
1202 ocfs2_commit_trans(osb, handle); 1180 ocfs2_commit_trans(osb, handle);
@@ -1433,16 +1411,90 @@ out:
1433 return ret; 1411 return ret;
1434} 1412}
1435 1413
1414static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1415{
1416 int i;
1417 struct ocfs2_extent_rec *rec = NULL;
1418
1419 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1420
1421 rec = &el->l_recs[i];
1422
1423 if (le32_to_cpu(rec->e_cpos) < pos)
1424 break;
1425 }
1426
1427 return i;
1428}
1429
1430/*
1431 * Helper to calculate the punching pos and length in one run, we handle the
1432 * following three cases in order:
1433 *
1434 * - remove the entire record
1435 * - remove a partial record
1436 * - no record needs to be removed (hole-punching completed)
1437*/
1438static void ocfs2_calc_trunc_pos(struct inode *inode,
1439 struct ocfs2_extent_list *el,
1440 struct ocfs2_extent_rec *rec,
1441 u32 trunc_start, u32 *trunc_cpos,
1442 u32 *trunc_len, u32 *trunc_end,
1443 u64 *blkno, int *done)
1444{
1445 int ret = 0;
1446 u32 coff, range;
1447
1448 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1449
1450 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1451 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1452 /*
1453 * Skip holes if any.
1454 */
1455 if (range < *trunc_end)
1456 *trunc_end = range;
1457 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1458 *blkno = le64_to_cpu(rec->e_blkno);
1459 *trunc_end = le32_to_cpu(rec->e_cpos);
1460 } else if (range > trunc_start) {
1461 *trunc_cpos = trunc_start;
1462 *trunc_len = *trunc_end - trunc_start;
1463 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1464 *blkno = le64_to_cpu(rec->e_blkno) +
1465 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1466 *trunc_end = trunc_start;
1467 } else {
1468 /*
 1469 * There are two possibilities:
 1470 *
 1471 * - the last record has been removed
 1472 * - trunc_start was within a hole
 1473 *
 1474 * Either case means hole punching has completed.
1475 */
1476 ret = 1;
1477 }
1478
1479 *done = ret;
1480}
1481
1436static int ocfs2_remove_inode_range(struct inode *inode, 1482static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start, 1483 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len) 1484 u64 byte_len)
1439{ 1485{
1440 int ret = 0; 1486 int ret = 0, flags = 0, done = 0, i;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1487 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1488 u32 cluster_in_el;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc; 1490 struct ocfs2_cached_dealloc_ctxt dealloc;
1444 struct address_space *mapping = inode->i_mapping; 1491 struct address_space *mapping = inode->i_mapping;
1445 struct ocfs2_extent_tree et; 1492 struct ocfs2_extent_tree et;
1493 struct ocfs2_path *path = NULL;
1494 struct ocfs2_extent_list *el = NULL;
1495 struct ocfs2_extent_rec *rec = NULL;
1496 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1497 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1446 1498
1447 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1499 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1448 ocfs2_init_dealloc_ctxt(&dealloc); 1500 ocfs2_init_dealloc_ctxt(&dealloc);
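
A worked pass through ocfs2_calc_trunc_pos(): suppose one extent record
covers clusters [0,16) and the caller punches [4,12), so trunc_start = 4
and trunc_end = 12. ocfs2_find_rec(el, 12) returns that record (e_cpos 0
is below 12). Its e_cpos is below trunc_start but its range (16) is above
it, so the partial-record branch fires: trunc_cpos = 4, trunc_len = 12 -
4 = 8, blkno is advanced by 4 clusters' worth of blocks, and trunc_end
drops to 4. The caller's loop then sees trunc_end == trunc_start and
stops. A record lying entirely at or right of trunc_start is instead
removed by the first branch (clipping trunc_end to the record's range so
holes are skipped), and the final else branch reports that punching is
done.
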
@@ -1468,17 +1520,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1468 goto out; 1520 goto out;
1469 } 1521 }
1470 1522
1523 /*
1524 * For reflinks, we may need to CoW 2 clusters which might be
1525 * partially zero'd later, if hole's start and end offset were
1526 * within one cluster(means is not exactly aligned to clustersize).
1527 */
1528
1529 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1530
1531 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1532 if (ret) {
1533 mlog_errno(ret);
1534 goto out;
1535 }
1536
1537 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1538 if (ret) {
1539 mlog_errno(ret);
1540 goto out;
1541 }
1542 }
1543
1471 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1544 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1472 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1545 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1473 if (trunc_len >= trunc_start) 1546 cluster_in_el = trunc_end;
1474 trunc_len -= trunc_start;
1475 else
1476 trunc_len = 0;
1477 1547
1478 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1548 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1479 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1549 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1480 (unsigned long long)byte_start, 1550 (unsigned long long)byte_start,
1481 (unsigned long long)byte_len, trunc_start, trunc_len); 1551 (unsigned long long)byte_len, trunc_start, trunc_end);
1482 1552
1483 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1553 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1484 if (ret) { 1554 if (ret) {
@@ -1486,31 +1556,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1486 goto out; 1556 goto out;
1487 } 1557 }
1488 1558
1489 cpos = trunc_start; 1559 path = ocfs2_new_path_from_et(&et);
1490 while (trunc_len) { 1560 if (!path) {
1491 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1561 ret = -ENOMEM;
1492 &alloc_size, NULL); 1562 mlog_errno(ret);
1563 goto out;
1564 }
1565
1566 while (trunc_end > trunc_start) {
1567
1568 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1569 cluster_in_el);
1493 if (ret) { 1570 if (ret) {
1494 mlog_errno(ret); 1571 mlog_errno(ret);
1495 goto out; 1572 goto out;
1496 } 1573 }
1497 1574
1498 if (alloc_size > trunc_len) 1575 el = path_leaf_el(path);
1499 alloc_size = trunc_len; 1576
1577 i = ocfs2_find_rec(el, trunc_end);
1578 /*
1579 * Need to go to previous extent block.
1580 */
1581 if (i < 0) {
1582 if (path->p_tree_depth == 0)
1583 break;
1500 1584
1501 /* Only do work for non-holes */ 1585 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1502 if (phys_cpos != 0) { 1586 path,
1503 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1587 &cluster_in_el);
1504 phys_cpos, alloc_size,
1505 &dealloc);
1506 if (ret) { 1588 if (ret) {
1507 mlog_errno(ret); 1589 mlog_errno(ret);
1508 goto out; 1590 goto out;
1509 } 1591 }
1592
1593 /*
1594 * We've reached the leftmost extent block,
1595 * it's safe to leave.
1596 */
1597 if (cluster_in_el == 0)
1598 break;
1599
1600 /*
1601 * The 'pos' searched for previous extent block is
1602 * always one cluster less than actual trunc_end.
1603 */
1604 trunc_end = cluster_in_el + 1;
1605
1606 ocfs2_reinit_path(path, 1);
1607
1608 continue;
1609
1610 } else
1611 rec = &el->l_recs[i];
1612
1613 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1614 &trunc_len, &trunc_end, &blkno, &done);
1615 if (done)
1616 break;
1617
1618 flags = rec->e_flags;
1619 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1620
1621 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1622 phys_cpos, trunc_len, flags,
1623 &dealloc, refcount_loc);
1624 if (ret < 0) {
1625 mlog_errno(ret);
1626 goto out;
1510 } 1627 }
1511 1628
1512 cpos += alloc_size; 1629 cluster_in_el = trunc_end;
1513 trunc_len -= alloc_size; 1630
1631 ocfs2_reinit_path(path, 1);
1514 } 1632 }
1515 1633
1516 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1634 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
@@ -1981,18 +2099,18 @@ relock:
1981 /* communicate with ocfs2_dio_end_io */ 2099 /* communicate with ocfs2_dio_end_io */
1982 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2100 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1983 2101
1984 if (direct_io) { 2102 ret = generic_segment_checks(iov, &nr_segs, &ocount,
1985 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2103 VERIFY_READ);
1986 VERIFY_READ); 2104 if (ret)
1987 if (ret) 2105 goto out_dio;
1988 goto out_dio;
1989 2106
1990 count = ocount; 2107 count = ocount;
1991 ret = generic_write_checks(file, ppos, &count, 2108 ret = generic_write_checks(file, ppos, &count,
1992 S_ISBLK(inode->i_mode)); 2109 S_ISBLK(inode->i_mode));
1993 if (ret) 2110 if (ret)
1994 goto out_dio; 2111 goto out_dio;
1995 2112
2113 if (direct_io) {
1996 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2114 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1997 ppos, count, ocount); 2115 ppos, count, ocount);
1998 if (written < 0) { 2116 if (written < 0) {
@@ -2000,14 +2118,21 @@ relock:
2000 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2001 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2002 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2003 */ 2125 */
2004 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2005 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2006 ret = written; 2128 ret = written;
2007 goto out_dio; 2129 goto out_dio;
2008 } 2130 }
2009 } else { 2131 } else {
2010 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); 2132 current->backing_dev_info = file->f_mapping->backing_dev_info;
2133 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2134 ppos, count, 0);
2135 current->backing_dev_info = NULL;
2011 } 2136 }
2012 2137
2013out_dio: 2138out_dio:
@@ -2021,9 +2146,9 @@ out_dio:
2021 if (ret < 0) 2146 if (ret < 0)
2022 written = ret; 2147 written = ret;
2023 2148
2024 if (!ret && (old_size != i_size_read(inode) || 2149 if (!ret && ((old_size != i_size_read(inode)) ||
2025 old_clusters != OCFS2_I(inode)->ip_clusters || 2150 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2026 has_refcount)) { 2151 has_refcount)) {
2027 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2152 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2028 if (ret < 0) 2153 if (ret < 0)
2029 written = ret; 2154 written = ret;
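
Two things happen in the write path above: the segment and write checks
move out of the direct-I/O branch so both paths share them, and the
buffered path calls generic_file_buffered_write() directly instead of
__generic_file_aio_write(). That helper used to set
current->backing_dev_info, so ocfs2 now does it by hand; the annotated
shape, as shown in the hunk:

        /* balance_dirty_pages_ratelimited() throttles the writer against
         * current->backing_dev_info, so point it at the target device
         * for the duration of the copy */
        current->backing_dev_info = file->f_mapping->backing_dev_info;
        written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
                                              ppos, count, 0);
        current->backing_dev_info = NULL;
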
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -377,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
377 376
378 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
379 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
380 mlog_exit_void(); 383 mlog_exit_void();
381} 384}
382 385
@@ -540,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
540 struct buffer_head *fe_bh) 543 struct buffer_head *fe_bh)
541{ 544{
542 int status = 0; 545 int status = 0;
543 struct ocfs2_truncate_context *tc = NULL;
544 struct ocfs2_dinode *fe; 546 struct ocfs2_dinode *fe;
545 handle_t *handle = NULL; 547 handle_t *handle = NULL;
546 548
@@ -559,6 +561,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
559 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 561 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
560 if (IS_ERR(handle)) { 562 if (IS_ERR(handle)) {
561 status = PTR_ERR(handle); 563 status = PTR_ERR(handle);
564 handle = NULL;
562 mlog_errno(status); 565 mlog_errno(status);
563 goto out; 566 goto out;
564 } 567 }
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 585 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 586 handle = NULL;
584 587
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 588 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 589 if (status < 0) {
593 mlog_errno(status); 590 mlog_errno(status);
594 goto out; 591 goto out;
@@ -640,11 +637,13 @@ static int ocfs2_remove_inode(struct inode *inode,
640 goto bail_unlock; 637 goto bail_unlock;
641 } 638 }
642 639
643 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 640 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
644 orphan_dir_bh); 641 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
645 if (status < 0) { 642 orphan_dir_bh);
646 mlog_errno(status); 643 if (status < 0) {
647 goto bail_commit; 644 mlog_errno(status);
645 goto bail_commit;
646 }
648 } 647 }
649 648
650 /* set the inodes dtime */ 649 /* set the inodes dtime */
@@ -657,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
657 656
658 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 657 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
659 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 658 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
660 659 ocfs2_journal_dirty(handle, di_bh);
661 status = ocfs2_journal_dirty(handle, di_bh);
662 if (status < 0) {
663 mlog_errno(status);
664 goto bail_commit;
665 }
666 660
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 661 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 dquot_free_inode(inode); 662 dquot_free_inode(inode);
@@ -723,38 +717,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
723static int ocfs2_wipe_inode(struct inode *inode, 717static int ocfs2_wipe_inode(struct inode *inode,
724 struct buffer_head *di_bh) 718 struct buffer_head *di_bh)
725{ 719{
726 int status, orphaned_slot; 720 int status, orphaned_slot = -1;
727 struct inode *orphan_dir_inode = NULL; 721 struct inode *orphan_dir_inode = NULL;
728 struct buffer_head *orphan_dir_bh = NULL; 722 struct buffer_head *orphan_dir_bh = NULL;
729 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 723 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
730 struct ocfs2_dinode *di; 724 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
731 725
732 di = (struct ocfs2_dinode *) di_bh->b_data; 726 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
733 orphaned_slot = le16_to_cpu(di->i_orphaned_slot); 727 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
734 728
735 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 729 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
736 if (status) 730 if (status)
737 return status; 731 return status;
738 732
739 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 733 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
740 ORPHAN_DIR_SYSTEM_INODE, 734 ORPHAN_DIR_SYSTEM_INODE,
741 orphaned_slot); 735 orphaned_slot);
742 if (!orphan_dir_inode) { 736 if (!orphan_dir_inode) {
743 status = -EEXIST; 737 status = -EEXIST;
744 mlog_errno(status); 738 mlog_errno(status);
745 goto bail; 739 goto bail;
746 } 740 }
747 741
748 /* Lock the orphan dir. The lock will be held for the entire 742 /* Lock the orphan dir. The lock will be held for the entire
749 * delete_inode operation. We do this now to avoid races with 743 * delete_inode operation. We do this now to avoid races with
750 * recovery completion on other nodes. */ 744 * recovery completion on other nodes. */
751 mutex_lock(&orphan_dir_inode->i_mutex); 745 mutex_lock(&orphan_dir_inode->i_mutex);
752 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 746 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
753 if (status < 0) { 747 if (status < 0) {
754 mutex_unlock(&orphan_dir_inode->i_mutex); 748 mutex_unlock(&orphan_dir_inode->i_mutex);
755 749
756 mlog_errno(status); 750 mlog_errno(status);
757 goto bail; 751 goto bail;
752 }
758 } 753 }
759 754
760 /* we do this while holding the orphan dir lock because we 755 /* we do this while holding the orphan dir lock because we
@@ -795,6 +790,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
795 mlog_errno(status); 790 mlog_errno(status);
796 791
797bail_unlock_dir: 792bail_unlock_dir:
793 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
794 return status;
795
798 ocfs2_inode_unlock(orphan_dir_inode, 1); 796 ocfs2_inode_unlock(orphan_dir_inode, 1);
799 mutex_unlock(&orphan_dir_inode->i_mutex); 797 mutex_unlock(&orphan_dir_inode->i_mutex);
800 brelse(orphan_dir_bh); 798 brelse(orphan_dir_bh);
@@ -890,7 +888,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
890 888
891 /* Do some basic inode verification... */ 889 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 890 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 891 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
892 !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
@@ -958,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
958void ocfs2_delete_inode(struct inode *inode) 972void ocfs2_delete_inode(struct inode *inode)
959{ 973{
960 int wipe, status; 974 int wipe, status;
961 sigset_t blocked, oldset; 975 sigset_t oldset;
962 struct buffer_head *di_bh = NULL; 976 struct buffer_head *di_bh = NULL;
963 977
964 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 978 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -985,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
985 * messaging paths may return us -ERESTARTSYS. Which would 999 * messaging paths may return us -ERESTARTSYS. Which would
986 * cause us to exit early, resulting in inodes being orphaned 1000 * cause us to exit early, resulting in inodes being orphaned
987 * forever. */ 1001 * forever. */
988 sigfillset(&blocked); 1002 ocfs2_block_signals(&oldset);
989 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
990 if (status < 0) {
991 mlog_errno(status);
992 ocfs2_cleanup_delete_inode(inode, 1);
993 goto bail;
994 }
995 1003
996 /* 1004 /*
997 * Synchronize us against ocfs2_get_dentry. We take this in 1005 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1065,9 +1073,7 @@ bail_unlock_nfs_sync:
1065 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1073 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1066 1074
1067bail_unblock: 1075bail_unblock:
1068 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1076 ocfs2_unblock_signals(&oldset);
1069 if (status < 0)
1070 mlog_errno(status);
1071bail: 1077bail:
1072 clear_inode(inode); 1078 clear_inode(inode);
1073 mlog_exit_void(); 1079 mlog_exit_void();
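
ocfs2_block_signals()/ocfs2_unblock_signals() replace the open-coded
sigprocmask() pair, folding away the cannot-really-fail error handling. A
sketch of the helpers, assuming they wrap sigprocmask() exactly as the
removed code did:

        void ocfs2_block_signals(sigset_t *oldset)
        {
                sigset_t blocked;

                sigfillset(&blocked);
                /* sigprocmask() on the current task cannot fail for
                 * SIG_BLOCK, hence no error path for callers */
                BUG_ON(sigprocmask(SIG_BLOCK, &blocked, oldset));
        }

        void ocfs2_unblock_signals(sigset_t *oldset)
        {
                BUG_ON(sigprocmask(SIG_SETMASK, oldset, NULL));
        }
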
@@ -1101,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
1101 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1107 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1102 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1103 1109
1110 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1111 &oi->ip_la_data_resv);
1112 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1113
1104 /* We very well may get a clear_inode before all an inodes 1114 /* We very well may get a clear_inode before all an inodes
1105 * metadata has hit disk. Of course, we can't drop any cluster 1115 * metadata has hit disk. Of course, we can't drop any cluster
1106 * locks until the journal has finished with it. The only 1116 * locks until the journal has finished with it. The only
@@ -1276,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1276 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1286 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1277 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1287 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1278 1288
1279 status = ocfs2_journal_dirty(handle, bh); 1289 ocfs2_journal_dirty(handle, bh);
1280 if (status < 0)
1281 mlog_errno(status);
1282
1283 status = 0;
1284leave: 1290leave:
1285
1286 mlog_exit(status); 1291 mlog_exit(status);
1287 return status; 1292 return status;
1288} 1293}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
@@ -100,6 +102,8 @@ struct ocfs2_inode_info
100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 102#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
101/* Does someone have the file open O_DIRECT */ 103/* Does someone have the file open O_DIRECT */
102#define OCFS2_INODE_OPEN_DIRECT 0x00000040 104#define OCFS2_INODE_OPEN_DIRECT 0x00000040
105/* Tell the inode wipe code it's not in orphan dir */
106#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080
103 107
104static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 108static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
105{ 109{
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
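
The extension logic matters because jbd2_journal_restart() takes the
total credit count for the restarted handle, not an increment. Before
this change, a handle started with, say, 32 credits that asked for 8 more
would be restarted with only 8 credits whenever jbd2_journal_extend()
could not grow it in place; capturing h_buffer_credits first preserves
the caller's reservation:

        old_nblocks = handle->h_buffer_credits;

        status = jbd2_journal_extend(handle, nblocks);
        if (status > 0) /* no room left in the running transaction */
                status = jbd2_journal_restart(handle,
                                              old_nblocks + nblocks);
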
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
79 * ocfs2_la_default_mb() - determine a default size, in megabytes of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help
93 * - block groups are allocated in 2mb and 4mb chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
99 * - Some file systems can't support very large sizes - 4k and 8k in
100 * particular are limited to less than 128 and 256 megabytes respectively.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int megs_per_slot;
122 struct super_block *sb = osb->sb;
123
124 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
125 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
126
127 /*
 128 * This takes care of file systems with very small group
129 * descriptors - 512 byte blocksize at cluster sizes lower
130 * than 16K and also 1k blocksize with 4k cluster size.
131 */
132 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
133 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
134 return OCFS2_LA_OLD_DEFAULT;
135
136 /*
137 * Leave enough room for some block groups and make the final
138 * value we work from a multiple of 4.
139 */
140 gd_mb -= 16;
141 gd_mb &= 0xFFFFFFFB;
142
143 la_mb = gd_mb;
144
145 /*
146 * Keep window sizes down to a reasonable default
147 */
148 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
149 /*
150 * Some clustersize / blocksize combinations will have
151 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
152 * default size, but get poor distribution when
153 * limited to exactly 256 megabytes.
154 *
155 * As an example, 16K clustersize at 4K blocksize
156 * gives us a cluster group size of 504M. Paring the
157 * local alloc size down to 256 however, would give us
158 * only one window and around 200MB left in the
159 * cluster group. Instead, find the first size below
160 * 256 which would give us an even distribution.
161 *
162 * Larger cluster group sizes actually work out pretty
163 * well when pared to 256, so we don't have to do this
164 * for any group that fits more than two
165 * OCFS2_LA_MAX_DEFAULT_MB windows.
166 */
167 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
168 la_mb = 256;
169 else {
170 unsigned int gd_mult = gd_mb;
171
172 while (gd_mult > 256)
173 gd_mult = gd_mult >> 1;
174
175 la_mb = gd_mult;
176 }
177 }
178
179 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
180 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
181 /* Too many nodes, too few disk clusters. */
182 if (megs_per_slot < la_mb)
183 la_mb = megs_per_slot;
184
185 return la_mb;
186}
187
188void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
189{
190 struct super_block *sb = osb->sb;
191 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
192 unsigned int la_max_mb;
193
194 la_max_mb = ocfs2_clusters_to_megabytes(sb,
195 ocfs2_local_alloc_size(sb) * 8);
196
197 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
198 requested_mb, la_max_mb, la_default_mb);
199
200 if (requested_mb == -1) {
201 /* No user request - use defaults */
202 osb->local_alloc_default_bits =
203 ocfs2_megabytes_to_clusters(sb, la_default_mb);
204 } else if (requested_mb > la_max_mb) {
205 /* Request is too big, we give the maximum available */
206 osb->local_alloc_default_bits =
207 ocfs2_megabytes_to_clusters(sb, la_max_mb);
208 } else {
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, requested_mb);
211 }
212
213 osb->local_alloc_bits = osb->local_alloc_default_bits;
214}
215
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 216static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 217{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 218 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
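
Following the new sizing code through the 4K blocksize / 16K clustersize
case from the reference table: the group descriptor covers 504M, minus 16
for other block groups gives 488, which the 0xFFFFFFFB mask leaves
unchanged. Since 488 is above OCFS2_LA_MAX_DEFAULT_MB but below two
maximum windows (512), the halving loop runs once: 488 >> 1 = 244, so the
default local alloc becomes 244M, giving two evenly sized windows per
group. (Note the mask only clears bit 2; rounding down to a true multiple
of 4, as the comment intends, would be gd_mb &= ~3.) The per-slot clamp
then applies on small volumes: a filesystem whose clusters-at-boot work
out to 100M per slot gets a 100M window regardless of the computed
default.
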
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 295 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 296 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 297 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 298 ocfs2_la_default_mb(osb));
160 } 299 }
161 300
162 /* read the alloc off disk */ 301 /* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 401
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 402 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 403
404 ocfs2_resmap_uninit(&osb->osb_la_resmap);
405
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 406 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 407 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 408 OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 446 }
306 447
307 ocfs2_clear_local_alloc(alloc); 448 ocfs2_clear_local_alloc(alloc);
308 449 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 450
315 brelse(bh); 451 brelse(bh);
316 osb->local_alloc_bh = NULL; 452 osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
481 return status; 617 return status;
482} 618}
483 619
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 620/*
525 * make sure we've got at least bits_wanted contiguous bits in the 621 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 622 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 709 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 710 (unsigned long long)ac->ac_max_block);
615 711
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 712 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 713 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 714 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 749 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 750 la = OCFS2_LOCAL_ALLOC(alloc);
666 751
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 752 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
753 ac->ac_resv);
668 if (start == -1) { 754 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 755 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 756 status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 760
675 bitmap = la->la_bitmap; 761 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 762 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 763 *num_bits = bits_wanted;
680 764
681 status = ocfs2_journal_access_di(handle, 765 status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 771 goto bail;
688 } 772 }
689 773
774 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
775 bits_wanted);
776
690 while(bits_wanted--) 777 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 778 ocfs2_set_bit(start++, bitmap);
692 779
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 780 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
781 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 782
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 783bail:
703 mlog_exit(status); 784 mlog_exit(status);
704 return status; 785 return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 803}
723 804
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 805static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 806 struct ocfs2_dinode *alloc,
726 u32 numbits) 807 u32 *numbits,
808 struct ocfs2_alloc_reservation *resv)
727{ 809{
728 int numfound, bitoff, left, startoff, lastzero; 810 int numfound, bitoff, left, startoff, lastzero;
811 int local_resv = 0;
812 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 813 void *bitmap = NULL;
814 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 815
731 mlog_entry("(numbits wanted = %u)\n", numbits); 816 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 817
733 if (!alloc->id1.bitmap1.i_total) { 818 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 819 mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 821 goto bail;
737 } 822 }
738 823
824 if (!resv) {
825 local_resv = 1;
826 ocfs2_resv_init_once(&r);
827 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
828 resv = &r;
829 }
830
831 numfound = *numbits;
832 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
833 if (numfound < *numbits)
834 *numbits = numfound;
835 goto bail;
836 }
837
838 /*
839 * Code error. While reservations are enabled, local
840 * allocation should _always_ go through them.
841 */
842 BUG_ON(osb->osb_resv_level != 0);
843
844 /*
845 * Reservations are disabled. Handle this the old way.
846 */
847
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 848 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 849
741 numfound = bitoff = startoff = 0; 850 numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 870 startoff = bitoff+1;
762 } 871 }
763 /* we got everything we needed */ 872 /* we got everything we needed */
764 if (numfound == numbits) { 873 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 874 /* mlog(0, "Found it all!\n"); */
766 break; 875 break;
767 } 876 }
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 879 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 880 numfound);
772 881
773 if (numfound == numbits) 882 if (numfound == *numbits)
774 bitoff = startoff - numfound; 883 bitoff = startoff - numfound;
775 else 884 else
776 bitoff = -1; 885 bitoff = -1;
777 886
778bail: 887bail:
888 if (local_resv)
889 ocfs2_resv_discard(resmap, resv);
890
779 mlog_exit(bitoff); 891 mlog_exit(bitoff);
780 return bitoff; 892 return bitoff;
781} 893}
@@ -872,8 +984,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 984 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 985 (unsigned long long)blkno);
874 986
875 status = ocfs2_free_clusters(handle, main_bm_inode, 987 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 988 main_bm_inode,
989 main_bm_bh, blkno,
990 count);
877 if (status < 0) { 991 if (status < 0) {
878 mlog_errno(status); 992 mlog_errno(status);
879 goto bail; 993 goto bail;
@@ -984,8 +1098,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 1098 }
985 1099
986retry_enospc: 1100retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 1101 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1102 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 1103 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 1104 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1048,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1048 /* we used the generic suballoc reserve function, but we set 1161 /* we used the generic suballoc reserve function, but we set
1049 * everything up nicely, so there's no reason why we can't use 1162 * everything up nicely, so there's no reason why we can't use
1050 * the more specific cluster api to claim bits. */ 1163 * the more specific cluster api to claim bits. */
1051 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1164 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1052 &cluster_off, &cluster_count); 1165 &cluster_off, &cluster_count);
1053 if (status == -ENOSPC) { 1166 if (status == -ENOSPC) {
1054retry_enospc: 1167retry_enospc:
@@ -1061,7 +1174,8 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1174 OCFS2_LA_DISABLED)
1062 goto bail; 1175 goto bail;
1063 1176
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1177 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1178 status = ocfs2_claim_clusters(handle, ac,
1065 osb->local_alloc_bits, 1179 osb->local_alloc_bits,
1066 &cluster_off, 1180 &cluster_off,
1067 &cluster_count); 1181 &cluster_count);
@@ -1096,6 +1210,9 @@ retry_enospc:
1096 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1210 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1097 le16_to_cpu(la->la_size)); 1211 le16_to_cpu(la->la_size));
1098 1212
1213 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1214 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1215
1099 mlog(0, "New window allocated:\n"); 1216 mlog(0, "New window allocated:\n");
1100 mlog(0, "window la_bm_off = %u\n", 1217 mlog(0, "window la_bm_off = %u\n",
1101 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1218 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1167,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1167 } 1284 }
1168 1285
1169 ocfs2_clear_local_alloc(alloc); 1286 ocfs2_clear_local_alloc(alloc);
1170 1287 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1171 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1172 if (status < 0) {
1173 mlog_errno(status);
1174 goto bail;
1175 }
1176 1288
1177 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1289 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1178 main_bm_inode, main_bm_bh); 1290 main_bm_inode, main_bm_bh);
@@ -1190,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1190 1302
1191 atomic_inc(&osb->alloc_stats.moves); 1303 atomic_inc(&osb->alloc_stats.moves);
1192 1304
1193 status = 0;
1194bail: 1305bail:
1195 if (handle) 1306 if (handle)
1196 ocfs2_commit_trans(osb, handle); 1307 ocfs2_commit_trans(osb, handle);
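
The localalloc.c changes above reorder allocation so that the reservations code (ocfs2_resmap_resv_bits) is consulted first, with the linear bitmap scan kept only as the fallback for when reservations are disabled. The toy program below models just that control flow; the names and the single-offset "reservation" are simplifications, not the kernel's ocfs2_alloc_reservation machinery, and it ignores the new ability of the kernel function to hand back a shorter run than requested:

#include <stdio.h>
#include <string.h>

#define WINDOW_BITS 64

static char window[WINDOW_BITS];    /* 1 = bit allocated */

/* first-fit scan: the pre-reservations behaviour */
static int scan_clear_bits(int wanted)
{
    int run = 0;

    for (int i = 0; i < WINDOW_BITS; i++) {
        run = window[i] ? 0 : run + 1;
        if (run == wanted)
            return i - wanted + 1;
    }
    return -1;
}

/* try the caller's reservation first, then fall back to scanning */
static int find_clear_bits(int resv_start, int wanted)
{
    if (resv_start >= 0 && resv_start + wanted <= WINDOW_BITS) {
        int free = 1;

        for (int i = 0; i < wanted; i++)
            free &= !window[resv_start + i];
        if (free)
            return resv_start;
    }
    return scan_clear_bits(wanted);
}

int main(void)
{
    memset(window, 1, 8);    /* bits 0-7 already used; calls are independent */
    printf("start = %d\n", find_clear_bits(16, 4));  /* 16, via the hint */
    printf("start = %d\n", find_clear_bits(-1, 4));  /*  8, via the scan */
    return 0;
}
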
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
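
The one-line locks.c fix narrows the mandatory-lock rejection so that unlock requests always reach ocfs2_plock(); before it, presumably, a held lock on a mandatory-lock inode could never be released through this path. A sketch of the fixed predicate in isolation (stub values, illustrative only):

#include <fcntl.h>
#include <stdio.h>

/* the fixed guard: only lock *requests* are refused */
static int would_reject(int mandatory, int fl_type)
{
    return mandatory && fl_type != F_UNLCK;
}

int main(void)
{
    printf("F_WRLCK on mandatory-lock inode: %s\n",
           would_reject(1, F_WRLCK) ? "-ENOLCK" : "passes through");
    printf("F_UNLCK on mandatory-lock inode: %s\n",
           would_reject(1, F_UNLCK) ? "-ENOLCK" : "passes through");
    return 0;
}
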
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
@@ -42,44 +41,20 @@
42#include "file.h" 41#include "file.h"
43#include "inode.h" 42#include "inode.h"
44#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
45 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62 46
63static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
64{ 48{
65 sigset_t blocked, oldset; 49 sigset_t oldset;
66 int error, ret; 50 int ret;
67 51
68 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
69 53
70 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
71 if (error < 0) {
72 mlog_errno(error);
73 ret = VM_FAULT_SIGBUS;
74 goto out;
75 }
76
77 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
78 57
79 error = ocfs2_vm_op_unblock_sigs(&oldset);
80 if (error < 0)
81 mlog_errno(error);
82out:
83 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
84 return ret; 59 return ret;
85} 60}
@@ -159,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 struct page *page = vmf->page; 134 struct page *page = vmf->page;
160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
161 struct buffer_head *di_bh = NULL; 136 struct buffer_head *di_bh = NULL;
162 sigset_t blocked, oldset; 137 sigset_t oldset;
163 int ret, ret2; 138 int ret;
164 139
165 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 140 ocfs2_block_signals(&oldset);
166 if (ret < 0) {
167 mlog_errno(ret);
168 return ret;
169 }
170 141
171 /* 142 /*
172 * The cluster locks taken will block a truncate from another 143 * The cluster locks taken will block a truncate from another
@@ -194,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
194 ocfs2_inode_unlock(inode, 1); 165 ocfs2_inode_unlock(inode, 1);
195 166
196out: 167out:
197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 168 ocfs2_unblock_signals(&oldset);
198 if (ret2 < 0)
199 mlog_errno(ret2);
200 if (ret) 169 if (ret)
201 ret = VM_FAULT_SIGBUS; 170 ret = VM_FAULT_SIGBUS;
202 return ret; 171 return ret;
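
ocfs2_block_signals()/ocfs2_unblock_signals() themselves are not part of this hunk (they arrive via the new "super.h" include); judging from the helpers they replace, they wrap the block-everything/restore pattern below. Userspace sketch under that assumption:

#include <signal.h>
#include <stdio.h>

static void block_all_signals(sigset_t *oldset)
{
    sigset_t blocked;

    /* Block signals up front rather than letting deep locking
     * paths fail with -ERESTARTSYS, as the removed comment said. */
    sigfillset(&blocked);
    sigprocmask(SIG_BLOCK, &blocked, oldset);
}

static void restore_signals(const sigset_t *oldset)
{
    sigprocmask(SIG_SETMASK, oldset, NULL);
}

int main(void)
{
    sigset_t oldset;

    block_all_signals(&oldset);
    puts("critical section: no signal delivery here");
    restore_signals(&oldset);
    return 0;
}
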
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -408,23 +403,28 @@ static int ocfs2_mknod(struct inode *dir,
408 } 403 }
409 } 404 }
410 405
411 status = ocfs2_add_entry(handle, dentry, inode, 406 /*
412 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 407 * Do this before adding the entry to the directory. We
413 &lookup); 408 * also set d_op after success so that ->d_iput() will clean up
414 if (status < 0) { 409 * the dentry lock even if ocfs2_add_entry() fails below.
410 */
411 status = ocfs2_dentry_attach_lock(dentry, inode,
412 OCFS2_I(dir)->ip_blkno);
413 if (status) {
415 mlog_errno(status); 414 mlog_errno(status);
416 goto leave; 415 goto leave;
417 } 416 }
417 dentry->d_op = &ocfs2_dentry_ops;
418 418
419 status = ocfs2_dentry_attach_lock(dentry, inode, 419 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(dir)->ip_blkno); 420 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
421 if (status) { 421 &lookup);
422 if (status < 0) {
422 mlog_errno(status); 423 mlog_errno(status);
423 goto leave; 424 goto leave;
424 } 425 }
425 426
426 insert_inode_hash(inode); 427 insert_inode_hash(inode);
427 dentry->d_op = &ocfs2_dentry_ops;
428 d_instantiate(dentry, inode); 428 d_instantiate(dentry, inode);
429 status = 0; 429 status = 0;
430leave: 430leave:
@@ -434,6 +434,8 @@ leave:
434 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
435 435
436 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
437 439
438 if (status == -ENOSPC) 440 if (status == -ENOSPC)
439 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -445,11 +447,6 @@ leave:
445 447
446 ocfs2_free_dir_lookup_result(&lookup); 448 ocfs2_free_dir_lookup_result(&lookup);
447 449
448 if ((status < 0) && inode) {
449 clear_nlink(inode);
450 iput(inode);
451 }
452
453 if (inode_ac) 450 if (inode_ac)
454 ocfs2_free_alloc_context(inode_ac); 451 ocfs2_free_alloc_context(inode_ac);
455 452
@@ -459,6 +456,17 @@ leave:
459 if (meta_ac) 456 if (meta_ac)
460 ocfs2_free_alloc_context(meta_ac); 457 ocfs2_free_alloc_context(meta_ac);
461 458
459 /*
460 * We should call iput after the i_mutex of the bitmap has
461 * been unlocked in ocfs2_free_alloc_context, or
462 * ocfs2_delete_inode will mutex_lock it again.
463 */
464 if ((status < 0) && inode) {
465 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
466 clear_nlink(inode);
467 iput(inode);
468 }
469
462 mlog_exit(status); 470 mlog_exit(status);
463 471
464 return status; 472 return status;
@@ -476,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
476 int status = 0; 484 int status = 0;
477 struct ocfs2_dinode *fe = NULL; 485 struct ocfs2_dinode *fe = NULL;
478 struct ocfs2_extent_list *fel; 486 struct ocfs2_extent_list *fel;
479 u64 fe_blkno = 0; 487 u64 suballoc_loc, fe_blkno = 0;
480 u16 suballoc_bit; 488 u16 suballoc_bit;
481 u16 feat; 489 u16 feat;
482 490
483 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
484 492
485 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
486 inode_ac, &suballoc_bit, &fe_blkno); 494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
487 if (status < 0) { 496 if (status < 0) {
488 mlog_errno(status); 497 mlog_errno(status);
489 goto leave; 498 goto leave;
@@ -520,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
520 fe->i_generation = cpu_to_le32(inode->i_generation); 529 fe->i_generation = cpu_to_le32(inode->i_generation);
521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 530 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
522 fe->i_blkno = cpu_to_le64(fe_blkno); 531 fe->i_blkno = cpu_to_le64(fe_blkno);
532 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
523 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 533 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
524 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 534 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
525 fe->i_uid = cpu_to_le32(inode->i_uid); 535 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -556,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
556 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 566 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
557 } 567 }
558 568
559 status = ocfs2_journal_dirty(handle, *new_fe_bh); 569 ocfs2_journal_dirty(handle, *new_fe_bh);
560 if (status < 0) {
561 mlog_errno(status);
562 goto leave;
563 }
564 570
565 ocfs2_populate_inode(inode, fe, 1); 571 ocfs2_populate_inode(inode, fe, 1);
566 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 572 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -626,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry,
626 struct ocfs2_dinode *fe = NULL; 632 struct ocfs2_dinode *fe = NULL;
627 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 633 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
628 struct ocfs2_dir_lookup_result lookup = { NULL, }; 634 struct ocfs2_dir_lookup_result lookup = { NULL, };
635 sigset_t oldset;
629 636
630 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 637 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
631 old_dentry->d_name.len, old_dentry->d_name.name, 638 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -682,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 689 goto out_unlock_inode;
683 } 690 }
684 691
692 /* Starting to change things, restart is no longer possible. */
693 ocfs2_block_signals(&oldset);
694
685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 695 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 696 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 697 if (err < 0) {
@@ -694,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry,
694 ocfs2_set_links_count(fe, inode->i_nlink); 704 ocfs2_set_links_count(fe, inode->i_nlink);
695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 705 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 706 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
697 707 ocfs2_journal_dirty(handle, fe_bh);
698 err = ocfs2_journal_dirty(handle, fe_bh);
699 if (err < 0) {
700 ocfs2_add_links_count(fe, -1);
701 drop_nlink(inode);
702 mlog_errno(err);
703 goto out_commit;
704 }
705 708
706 err = ocfs2_add_entry(handle, dentry, inode, 709 err = ocfs2_add_entry(handle, dentry, inode,
707 OCFS2_I(inode)->ip_blkno, 710 OCFS2_I(inode)->ip_blkno,
@@ -725,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry,
725 728
726out_commit: 729out_commit:
727 ocfs2_commit_trans(osb, handle); 730 ocfs2_commit_trans(osb, handle);
731 ocfs2_unblock_signals(&oldset);
728out_unlock_inode: 732out_unlock_inode:
729 ocfs2_inode_unlock(inode, 1); 733 ocfs2_inode_unlock(inode, 1);
730 734
@@ -879,7 +883,7 @@ static int ocfs2_unlink(struct inode *dir,
879 fe = (struct ocfs2_dinode *) fe_bh->b_data; 883 fe = (struct ocfs2_dinode *) fe_bh->b_data;
880 884
881 if (inode_is_unlinkable(inode)) { 885 if (inode_is_unlinkable(inode)) {
882 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 886 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
883 &orphan_insert, orphan_dir); 887 &orphan_insert, orphan_dir);
884 if (status < 0) { 888 if (status < 0) {
885 mlog_errno(status); 889 mlog_errno(status);
@@ -898,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir,
898 drop_nlink(inode); 902 drop_nlink(inode);
899 drop_nlink(inode); 903 drop_nlink(inode);
900 ocfs2_set_links_count(fe, inode->i_nlink); 904 ocfs2_set_links_count(fe, inode->i_nlink);
901 905 ocfs2_journal_dirty(handle, fe_bh);
902 status = ocfs2_journal_dirty(handle, fe_bh);
903 if (status < 0) {
904 mlog_errno(status);
905 goto leave;
906 }
907 906
908 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 907 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
909 if (S_ISDIR(inode->i_mode)) 908 if (S_ISDIR(inode->i_mode))
@@ -1300,7 +1299,7 @@ static int ocfs2_rename(struct inode *old_dir,
1300 if (S_ISDIR(new_inode->i_mode) || 1299 if (S_ISDIR(new_inode->i_mode) ||
1301 (ocfs2_read_links_count(newfe) == 1)) { 1300 (ocfs2_read_links_count(newfe) == 1)) {
1302 status = ocfs2_orphan_add(osb, handle, new_inode, 1301 status = ocfs2_orphan_add(osb, handle, new_inode,
1303 newfe, orphan_name, 1302 newfe_bh, orphan_name,
1304 &orphan_insert, orphan_dir); 1303 &orphan_insert, orphan_dir);
1305 if (status < 0) { 1304 if (status < 0) {
1306 mlog_errno(status); 1305 mlog_errno(status);
@@ -1321,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir,
1321 ocfs2_set_links_count(newfe, 0); 1320 ocfs2_set_links_count(newfe, 0);
1322 else 1321 else
1323 ocfs2_add_links_count(newfe, -1); 1322 ocfs2_add_links_count(newfe, -1);
1324 1323 ocfs2_journal_dirty(handle, newfe_bh);
1325 status = ocfs2_journal_dirty(handle, newfe_bh);
1326 if (status < 0) {
1327 mlog_errno(status);
1328 goto bail;
1329 }
1330 } else { 1324 } else {
1331 /* if the name was not found in new_dir, add it now */ 1325 /* if the name was not found in new_dir, add it now */
1332 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1326 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1345,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1345 1339
1346 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1340 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1347 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1341 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1348 1342 ocfs2_journal_dirty(handle, old_inode_bh);
1349 status = ocfs2_journal_dirty(handle, old_inode_bh);
1350 if (status < 0)
1351 mlog_errno(status);
1352 } else 1343 } else
1353 mlog_errno(status); 1344 mlog_errno(status);
1354 1345
@@ -1420,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
1420 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1421 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1422 ocfs2_set_links_count(fe, old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1423 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 ocfs2_journal_dirty(handle, old_dir_bh);
1424 } 1415 }
1425 } 1416 }
1426 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1552,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1552 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1543 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1553 bytes_left); 1544 bytes_left);
1554 1545
1555 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1546 ocfs2_journal_dirty(handle, bhs[virtual]);
1556 if (status < 0) {
1557 mlog_errno(status);
1558 goto bail;
1559 }
1560 1547
1561 virtual++; 1548 virtual++;
1562 p_blkno++; 1549 p_blkno++;
@@ -1600,6 +1587,8 @@ static int ocfs2_symlink(struct inode *dir,
1600 }; 1587 };
1601 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1602 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1590 sigset_t oldset;
1591 int did_block_signals = 0;
1603 1592
1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1593 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1605 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1594 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1695,6 +1684,10 @@ static int ocfs2_symlink(struct inode *dir,
1695 goto bail; 1684 goto bail;
1696 } 1685 }
1697 1686
1687 /* Starting to change things, restart is no longer possible. */
1688 ocfs2_block_signals(&oldset);
1689 did_block_signals = 1;
1690
1698 status = dquot_alloc_inode(inode); 1691 status = dquot_alloc_inode(inode);
1699 if (status) 1692 if (status)
1700 goto bail; 1693 goto bail;
@@ -1771,22 +1764,27 @@ static int ocfs2_symlink(struct inode *dir,
1771 } 1764 }
1772 } 1765 }
1773 1766
1774 status = ocfs2_add_entry(handle, dentry, inode, 1767 /*
1775 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1768 * Do this before adding the entry to the directory. We
1776 &lookup); 1769 * also set d_op after success so that ->d_iput() will clean up
1777 if (status < 0) { 1770 * the dentry lock even if ocfs2_add_entry() fails below.
1771 */
1772 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1773 if (status) {
1778 mlog_errno(status); 1774 mlog_errno(status);
1779 goto bail; 1775 goto bail;
1780 } 1776 }
1777 dentry->d_op = &ocfs2_dentry_ops;
1781 1778
1782 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1779 status = ocfs2_add_entry(handle, dentry, inode,
1783 if (status) { 1780 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1781 &lookup);
1782 if (status < 0) {
1784 mlog_errno(status); 1783 mlog_errno(status);
1785 goto bail; 1784 goto bail;
1786 } 1785 }
1787 1786
1788 insert_inode_hash(inode); 1787 insert_inode_hash(inode);
1789 dentry->d_op = &ocfs2_dentry_ops;
1790 d_instantiate(dentry, inode); 1788 d_instantiate(dentry, inode);
1791bail: 1789bail:
1792 if (status < 0 && did_quota) 1790 if (status < 0 && did_quota)
@@ -1798,6 +1796,8 @@ bail:
1798 ocfs2_commit_trans(osb, handle); 1796 ocfs2_commit_trans(osb, handle);
1799 1797
1800 ocfs2_inode_unlock(dir, 1); 1798 ocfs2_inode_unlock(dir, 1);
1799 if (did_block_signals)
1800 ocfs2_unblock_signals(&oldset);
1801 1801
1802 brelse(new_fe_bh); 1802 brelse(new_fe_bh);
1803 brelse(parent_fe_bh); 1803 brelse(parent_fe_bh);
@@ -1811,6 +1811,7 @@ bail:
1811 if (xattr_ac) 1811 if (xattr_ac)
1812 ocfs2_free_alloc_context(xattr_ac); 1812 ocfs2_free_alloc_context(xattr_ac);
1813 if ((status < 0) && inode) { 1813 if ((status < 0) && inode) {
1814 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1814 clear_nlink(inode); 1815 clear_nlink(inode);
1815 iput(inode); 1816 iput(inode);
1816 } 1817 }
@@ -1911,7 +1912,7 @@ leave:
1911static int ocfs2_orphan_add(struct ocfs2_super *osb, 1912static int ocfs2_orphan_add(struct ocfs2_super *osb,
1912 handle_t *handle, 1913 handle_t *handle,
1913 struct inode *inode, 1914 struct inode *inode,
1914 struct ocfs2_dinode *fe, 1915 struct buffer_head *fe_bh,
1915 char *name, 1916 char *name,
1916 struct ocfs2_dir_lookup_result *lookup, 1917 struct ocfs2_dir_lookup_result *lookup,
1917 struct inode *orphan_dir_inode) 1918 struct inode *orphan_dir_inode)
@@ -1919,6 +1920,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1919 struct buffer_head *orphan_dir_bh = NULL; 1920 struct buffer_head *orphan_dir_bh = NULL;
1920 int status = 0; 1921 int status = 0;
1921 struct ocfs2_dinode *orphan_fe; 1922 struct ocfs2_dinode *orphan_fe;
1923 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1922 1924
1923 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1925 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1924 1926
@@ -1943,29 +1945,42 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1943 if (S_ISDIR(inode->i_mode)) 1945 if (S_ISDIR(inode->i_mode))
1944 ocfs2_add_links_count(orphan_fe, 1); 1946 ocfs2_add_links_count(orphan_fe, 1);
1945 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1947 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1948 ocfs2_journal_dirty(handle, orphan_dir_bh);
1946 1949
1947 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1950 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1951 OCFS2_ORPHAN_NAMELEN, inode,
1952 OCFS2_I(inode)->ip_blkno,
1953 orphan_dir_bh, lookup);
1948 if (status < 0) { 1954 if (status < 0) {
1949 mlog_errno(status); 1955 mlog_errno(status);
1950 goto leave; 1956 goto leave;
1951 } 1957 }
1952 1958
1953 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1959 /*
1954 OCFS2_ORPHAN_NAMELEN, inode, 1960 * We're going to journal the change of i_flags and i_orphaned_slot.
1955 OCFS2_I(inode)->ip_blkno, 1961 * It's safe anyway, though some callers may duplicate the journaling.
1956 orphan_dir_bh, lookup); 1962 * Journaling within the func just make the logic look more
1963 * straightforward.
1964 */
1965 status = ocfs2_journal_access_di(handle,
1966 INODE_CACHE(inode),
1967 fe_bh,
1968 OCFS2_JOURNAL_ACCESS_WRITE);
1957 if (status < 0) { 1969 if (status < 0) {
1958 mlog_errno(status); 1970 mlog_errno(status);
1959 goto leave; 1971 goto leave;
1960 } 1972 }
1961 1973
1962 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1974 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1975 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
1963 1976
1964 /* Record which orphan dir our inode now resides 1977 /* Record which orphan dir our inode now resides
1965 * in. delete_inode will use this to determine which orphan 1978 * in. delete_inode will use this to determine which orphan
1966 * dir to lock. */ 1979 * dir to lock. */
1967 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1980 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1968 1981
1982 ocfs2_journal_dirty(handle, fe_bh);
1983
1969 mlog(0, "Inode %llu orphaned in slot %d\n", 1984 mlog(0, "Inode %llu orphaned in slot %d\n",
1970 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1985 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1971 1986
@@ -2029,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2029 if (S_ISDIR(inode->i_mode)) 2044 if (S_ISDIR(inode->i_mode))
2030 ocfs2_add_links_count(orphan_fe, -1); 2045 ocfs2_add_links_count(orphan_fe, -1);
2031 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2046 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2032 2047 ocfs2_journal_dirty(handle, orphan_dir_bh);
2033 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2034 if (status < 0) {
2035 mlog_errno(status);
2036 goto leave;
2037 }
2038 2048
2039leave: 2049leave:
2040 ocfs2_free_dir_lookup_result(&lookup); 2050 ocfs2_free_dir_lookup_result(&lookup);
@@ -2123,7 +2133,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2123 } 2133 }
2124 2134
2125 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2135 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2126 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2136 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2127 &orphan_insert, orphan_dir); 2137 &orphan_insert, orphan_dir);
2128 if (status < 0) { 2138 if (status < 0) {
2129 mlog_errno(status); 2139 mlog_errno(status);
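
The ocfs2_get_init_inode() hunk replaces open-coded owner setup with inode_init_owner(). The standalone model below reproduces the semantics of the removed block, which is what the generic helper is expected to provide: uid from the caller, gid (and, for new directories, the setgid bit) inherited from a setgid parent. Plain ints stand in for kuid_t/kgid_t, and the toy_* names are hypothetical:

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { int uid, gid, mode; };

static void toy_init_owner(struct toy_inode *inode,
                           const struct toy_inode *dir,
                           int mode, int fsuid, int fsgid)
{
    inode->uid = fsuid;
    if (dir->mode & S_ISGID) {
        inode->gid = dir->gid;
        if (S_ISDIR(mode))
            mode |= S_ISGID;    /* new dirs inherit setgid */
    } else
        inode->gid = fsgid;
    inode->mode = mode;
}

int main(void)
{
    struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
    struct toy_inode child;

    toy_init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
    printf("gid=%d setgid=%s\n", child.gid,
           (child.mode & S_ISGID) ? "yes" : "no");   /* gid=100 setgid=yes */
    return 0;
}
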
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,8 +779,24 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
766#define ocfs2_set_bit ext2_set_bit 782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
767#define ocfs2_clear_bit ext2_clear_bit 783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
789{
790 ext2_set_bit(bit, bitmap);
791}
792#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
793
794static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
795{
796 ext2_clear_bit(bit, bitmap);
797}
798#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
799
768#define ocfs2_test_bit ext2_test_bit 800#define ocfs2_test_bit ext2_test_bit
769#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 801#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
770#define ocfs2_find_next_bit ext2_find_next_bit 802#define ocfs2_find_next_bit ext2_find_next_bit
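
A quick check of the new conversion helpers added to ocfs2.h above. With 4K clusters (s_clustersize_bits = 12) the shift count is 20 - 12 = 8, so one megabyte is 256 clusters and the two functions are exact inverses for whole megabytes; the sample values are illustrative:

#include <stdio.h>

static unsigned int megs_to_clusters(unsigned int megs, int csbits)
{
    return megs << (20 - csbits);
}

static unsigned int clusters_to_megs(unsigned int clusters, int csbits)
{
    return clusters >> (20 - csbits);
}

int main(void)
{
    int csbits = 12;    /* 4K clusters */
    unsigned int c = megs_to_clusters(8, csbits);

    printf("8MB = %u clusters, back to %uMB\n",
           c, clusters_to_megs(c, csbits));   /* 2048 clusters, 8MB */
    return 0;
}
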
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
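
The discontiguous group-descriptor layout above can be sanity-checked with offsetof(): bg_bitmap starts at 0x40, so a 256-byte filler puts bg_list at 0x140, matching the 140 offset marker in the hunk, and matching what ocfs2_gd_is_discontig() tests (bg_bitmap offset + bg_size == bg_list offset). Cut-down toy structs, assuming the extent list header packs to three u16s with no padding surprises:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define MAX_BG_BITMAP_SIZE 256

struct toy_extent_list {      /* stand-in for ocfs2_extent_list's header */
    uint16_t l_tree_depth, l_count, l_next_free_rec;
};

struct toy_group_desc {
    uint8_t header[0x40];     /* bg_signature .. bg_reserved2 */
    union {
        uint8_t bg_bitmap[1]; /* [0] on disk; 1 here for strict C */
        struct {
            uint8_t bg_bitmap_filler[MAX_BG_BITMAP_SIZE];
            struct toy_extent_list bg_list;
        };
    };
};

int main(void)
{
    printf("bg_bitmap at 0x%zx, bg_list at 0x%zx\n",
           offsetof(struct toy_group_desc, bg_bitmap),   /* 0x40  */
           offsetof(struct toy_group_desc, bg_list));    /* 0x140 */
    return 0;
}
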
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Block number of the global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
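
The new dq_local_phys_blk and dqi_giblk fields exist so the quota code can resolve a file-relative block to a physical block once and then read it directly with ocfs2_read_quota_phys_block(), instead of walking the extent map on every access the way the removed ocfs2_get_quota_block() (shown in the next hunk) did. A toy illustration of that resolve-once pattern, with made-up block numbers:

#include <stdio.h>

static int extent_map_lookups;    /* how often we "walk the extent map" */

static unsigned long long translate(unsigned long long v_block)
{
    extent_map_lookups++;
    return 0x1000 + v_block;      /* pretend virtual-to-physical mapping */
}

int main(void)
{
    unsigned long long phys = translate(2);    /* resolve once, cache it */

    for (int i = 0; i < 3; i++)                /* ...then reuse directly */
        printf("read physical block 0x%llx\n", phys);
    printf("extent-map lookups: %d\n", extent_map_lookups);   /* 1 */
    return 0;
}
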
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..2bb35fe00511 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -24,8 +25,44 @@
24#include "dlmglue.h" 25#include "dlmglue.h"
25#include "uptodate.h" 26#include "uptodate.h"
26#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
27#include "quota.h" 29#include "quota.h"
28 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of a quota structure (either to the local or the global
35 * file) is protected by dqio_mutex or dquot->dq_lock.
36 * - any modification of the global quota file holds the inode cluster
37 * lock, i_mutex, and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for the local quota file is protected by
40 * its ip_alloc_sem.
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
29static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
30 67
31static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
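A minimal sketch of the "syncing of local and global file" ordering documented above, written against the helpers this series uses; illustrative only, not part of the patch. Error handling is elided and QSYNC_CREDITS is a placeholder for whatever credit count the caller computes; the other names are as in the surrounding code.

	ocfs2_lock_global_qf(oinfo, 1);		/* cluster lock, i_mutex, ip_alloc_sem */
	handle = ocfs2_start_trans(osb, QSYNC_CREDITS);
	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
	ocfs2_sync_dquot(dquot);		/* takes qinfo_lock, writes global file */
	ocfs2_local_write_dquot(dquot);		/* then the local file */
	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
	ocfs2_commit_trans(osb, handle);
	ocfs2_unlock_global_qf(oinfo, 1);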
@@ -90,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
90 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
91}; 128};
92 129
93static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
94 struct buffer_head *bh)
95{ 131{
96 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
97 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -109,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
109 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
110} 146}
111 147
112int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
113 struct buffer_head **bh) 149 struct buffer_head **bhp)
114{ 150{
115 int rc = 0; 151 int rc;
116 struct buffer_head *tmp = *bh; 152
117 153 *bhp = NULL;
118 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
119 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
120 "Quota file %llu is probably corrupted! Requested "
121 "to read block %Lu but file has size only %Lu\n",
122 (unsigned long long)OCFS2_I(inode)->ip_blkno,
123 (unsigned long long)v_block,
124 (unsigned long long)i_size_read(inode));
125 return -EIO;
126 }
127 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
128 ocfs2_validate_quota_block);
129 if (rc) 156 if (rc)
130 mlog_errno(rc); 157 mlog_errno(rc);
131
132 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
133 if (!rc && !*bh)
134 *bh = tmp;
135
136 return rc; 158 return rc;
137} 159}
138 160
139static int ocfs2_get_quota_block(struct inode *inode, int block,
140 struct buffer_head **bh)
141{
142 u64 pblock, pcount;
143 int err;
144
145 down_read(&OCFS2_I(inode)->ip_alloc_sem);
146 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
147 up_read(&OCFS2_I(inode)->ip_alloc_sem);
148 if (err) {
149 mlog_errno(err);
150 return err;
151 }
152 *bh = sb_getblk(inode->i_sb, pblock);
153 if (!*bh) {
154 err = -EIO;
155 mlog_errno(err);
156 }
157 return err;
158}
159
160/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
161 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
162 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -171,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
171 int err = 0; 172 int err = 0;
172 struct buffer_head *bh; 173 struct buffer_head *bh;
173 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
174 176
175 if (off > i_size) 177 if (off > i_size)
176 return 0; 178 return 0;
@@ -179,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
179 toread = len; 181 toread = len;
180 while (toread > 0) { 182 while (toread > 0) {
181 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
182 bh = NULL; 195 bh = NULL;
183 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
184 if (err) { 197 if (err) {
185 mlog_errno(err); 198 mlog_errno(err);
186 return err; 199 return err;
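The loop above maps virtual to physical blocks only once per contiguous run: ocfs2_extent_map_get_blocks() returns the first physical block and the run length, and later iterations just step pblock forward while pcount counts the run down. A simplified sketch of the pattern, with get_run(), read_one(), vblk and nr_blocks as hypothetical stand-ins:

	u64 pblock = 0, pcount = 0;

	while (nr_blocks--) {
		if (!pcount)			/* new extent: map once */
			get_run(inode, vblk, &pblock, &pcount);
		read_one(inode, pblock);	/* consume one block of the run */
		pblock++;
		pcount--;
		vblk++;
	}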
@@ -208,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
208 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
209 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
210 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
211 225
212 if (!handle) { 226 if (!handle) {
213 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -220,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
220 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
221 } 235 }
222 236
223 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
224 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
225 loff_t rounded_end = 238 loff_t rounded_end =
226 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
227 240
228 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
229 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
230 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
231 rounded_end); 244 rounded_end);
@@ -233,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
233 goto out; 246 goto out;
234 new = 1; 247 new = 1;
235 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
236 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
237 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
238 !new) { 256 !new) {
239 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
240 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
241 } else { 259 } else {
242 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
243 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
244 } 264 }
245 if (err) { 265 if (err) {
@@ -260,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
260 brelse(bh); 280 brelse(bh);
261 goto out; 281 goto out;
262 } 282 }
263 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
264 brelse(bh); 284 brelse(bh);
265 if (err < 0)
266 goto out;
267out: 285out:
268 if (err) { 286 if (err) {
269 mutex_unlock(&gqinode->i_mutex);
270 mlog_errno(err); 287 mlog_errno(err);
271 return err; 288 return err;
272 } 289 }
273 gqinode->i_version++; 290 gqinode->i_version++;
274 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
275 mutex_unlock(&gqinode->i_mutex);
276 return len; 292 return len;
277} 293}
278 294
@@ -290,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
290 else 306 else
291 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
292 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
293 return 0; 315 return 0;
294} 316}
295 317
296void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
297{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
298 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
299 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
300 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
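The lock and unlock paths key off the same ex flag to decide which of i_mutex and ip_alloc_sem to take and drop, so callers must pass matching values to both. A hedged usage sketch:

	status = ocfs2_lock_global_qf(oinfo, 1);	/* cluster lock + i_mutex + ip_alloc_sem (write) */
	if (status < 0)
		return status;
	/* ... start a transaction and modify the global quota file ... */
	ocfs2_unlock_global_qf(oinfo, 1);		/* must use the same ex value */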
@@ -312,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
312 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
313 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
314 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
315 int status; 344 int status;
316 345
317 mlog_entry_void(); 346 mlog_entry_void();
@@ -338,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
338 mlog_errno(status); 367 mlog_errno(status);
339 goto out_err; 368 goto out_err;
340 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
341 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
342 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
343 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
344 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
345 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
346 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -367,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
367out_err: 406out_err:
368 mlog_exit(status); 407 mlog_exit(status);
369 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
370} 413}
371 414
 372/* Write information to global quota file. Expects exclusive lock on quota
@@ -425,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
425 468
426static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
427{ 470{
428 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
429 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
430 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
431}
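The trailing +1 is the credit for the inode update the new comment mentions. A worked example, with the constant's value assumed for illustration (its definition is not part of this hunk):

	/*
	 * With ocfs2_global_qinit_alloc() == 3 and
	 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS == 1 (assumed), this reserves
	 * (3 + 2) * 1 + 1 = 6 credits: the allocated blocks, the tree
	 * root, the info block, plus one for the inode itself.
	 */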
432
433/* Read in information from global quota file and acquire a reference to it.
434 * dquot_acquire() has already started the transaction and locked quota file */
435int ocfs2_global_read_dquot(struct dquot *dquot)
436{
437 int err, err2, ex = 0;
438 struct super_block *sb = dquot->dq_sb;
439 int type = dquot->dq_type;
440 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
441 struct ocfs2_super *osb = OCFS2_SB(sb);
442 struct inode *gqinode = info->dqi_gqinode;
443 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
444 handle_t *handle = NULL;
445
446 err = ocfs2_qinfo_lock(info, 0);
447 if (err < 0)
448 goto out;
449 err = qtree_read_dquot(&info->dqi_gi, dquot);
450 if (err < 0)
451 goto out_qlock;
452 OCFS2_DQUOT(dquot)->dq_use_count++;
453 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
454 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
455 ocfs2_qinfo_unlock(info, 0);
456
457 if (!dquot->dq_off) { /* No real quota entry? */
458 ex = 1;
459 /*
460 * Add blocks to quota file before we start a transaction since
461 * locking allocators ranks above a transaction start
462 */
463 WARN_ON(journal_current_handle());
464 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
465 err = ocfs2_extend_no_holes(gqinode,
466 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
467 gqinode->i_size);
468 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
469 if (err < 0)
470 goto out;
471 }
472
473 handle = ocfs2_start_trans(osb,
474 ocfs2_calc_global_qinit_credits(sb, type));
475 if (IS_ERR(handle)) {
476 err = PTR_ERR(handle);
477 goto out;
478 }
479 err = ocfs2_qinfo_lock(info, ex);
480 if (err < 0)
481 goto out_trans;
482 err = qtree_write_dquot(&info->dqi_gi, dquot);
483 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
484 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
485 if (!err)
486 err = err2;
487 }
488out_qlock:
489 if (ex)
490 ocfs2_qinfo_unlock(info, 1);
491 else
492 ocfs2_qinfo_unlock(info, 0);
493out_trans:
494 if (handle)
495 ocfs2_commit_trans(osb, handle);
496out:
497 if (err < 0)
498 mlog_errno(err);
499 return err;
500} 475}
501 476
502/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
@@ -637,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
637 } 612 }
638 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
639 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
640 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
641 if (status < 0) 615 if (status < 0)
642 mlog_errno(status); 616 mlog_errno(status);
643 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
644 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
645 status = dquot_commit(dquot);
646 if (status < 0) 619 if (status < 0)
647 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
648 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
649out_ilock: 623out_ilock:
650 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -683,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
683 mlog_errno(status); 657 mlog_errno(status);
684 goto out; 658 goto out;
685 } 659 }
686 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
687 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
688out: 664out:
689 mlog_exit(status); 665 mlog_exit(status);
@@ -714,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
714 690
715 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
716 692
693 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
717 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
718 if (status < 0) 698 if (status < 0)
719 goto out; 699 goto out;
@@ -724,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
724 mlog_errno(status); 704 mlog_errno(status);
725 goto out_ilock; 705 goto out_ilock;
726 } 706 }
727 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
728 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
729out_ilock: 723out_ilock:
730 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
731out: 725out:
726 mutex_unlock(&dquot->dq_lock);
732 mlog_exit(status); 727 mlog_exit(status);
733 return status; 728 return status;
734} 729}
735 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
736static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
737{ 737{
738 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
739 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
740 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
741 747
742 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
743 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
744 * and instantiate possibly new dquot structure */ 750 /*
745 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
746 if (status < 0) 755 if (status < 0)
747 goto out; 756 goto out;
748 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
749 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
750out: 812out:
813 mutex_unlock(&dquot->dq_lock);
751 mlog_exit(status); 814 mlog_exit(status);
752 return status; 815 return status;
753} 816}
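A hedged summary of the flag protocol the function above relies on (bit names from <linux/quota.h>):

	/*
	 * DQ_READ_B: the global entry has been read into memory
	 * (qtree_read_dquot() succeeded), so a repeated acquire can skip
	 * the read. DQ_ACTIVE_B: the dquot is fully initialized and the
	 * local file entry exists; ocfs2_release_dquot() clears it again
	 * once the last reference is dropped.
	 */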
@@ -769,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
769 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
770 833
771 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
772 dquot_mark_dquot_dirty(dquot);
773 835
774 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
775 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -792,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
792 mlog_errno(status); 854 mlog_errno(status);
793 goto out_ilock; 855 goto out_ilock;
794 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
795 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
796 if (status < 0) { 859 if (status < 0) {
797 mlog_errno(status); 860 mlog_errno(status);
798 goto out_trans; 861 goto out_dlock;
799 } 862 }
800 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
801 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
802out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
803 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
804out_ilock: 868out_ilock:
805 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -851,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 915}
852 916
853const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
854 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
855 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
856 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
857 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..8bd70d4d184d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -21,6 +22,7 @@
21#include "dlmglue.h" 22#include "dlmglue.h"
22#include "quota.h" 23#include "quota.h"
23#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
24 26
25/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
26static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -118,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
118 lock_buffer(bh); 120 lock_buffer(bh);
119 modify(bh, private); 121 modify(bh, private);
120 unlock_buffer(bh); 122 unlock_buffer(bh);
121 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
122 if (status < 0) { 124
123 mlog_errno(status);
124 ocfs2_commit_trans(OCFS2_SB(sb), handle);
125 return status;
126 }
127 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
128 if (status < 0) { 126 if (status < 0) {
129 mlog_errno(status); 127 mlog_errno(status);
@@ -132,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
132 return 0; 130 return 0;
133} 131}
134 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
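A hedged sketch of the calling rule from the comment above: because the helper takes ip_alloc_sem, such reads must happen before the transaction starts (osb and credits are placeholders):

	status = ocfs2_read_quota_block(lqinode, v_block, &bh);	/* takes ip_alloc_sem */
	if (status)
		return status;
	handle = ocfs2_start_trans(osb, credits);		/* only then journal */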
165
135/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
136static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
137{ 168{
@@ -522,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
522 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
523 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
524 unlock_buffer(qbh); 555 unlock_buffer(qbh);
525 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
526 if (status < 0)
527 mlog_errno(status);
528out_commit: 557out_commit:
529 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
530 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -630,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
630 lock_buffer(bh); 659 lock_buffer(bh);
631 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
632 unlock_buffer(bh); 661 unlock_buffer(bh);
633 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
634 if (status < 0)
635 mlog_errno(status);
636out_trans: 663out_trans:
637 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
638out_bh: 665out_bh:
@@ -678,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
678 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
679 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
680 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
681 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
682 709
683 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
684 if (status < 0) 711 if (status < 0)
@@ -704,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
704 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
705 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
706 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
707 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
708 735
709 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
710 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -766,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
766{ 793{
767 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
768 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
769 ->dqi_ibh; 796 ->dqi_libh;
770 int status; 797 int status;
771 798
772 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -789,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
789 int mark_clean = 1, len; 816 int mark_clean = 1, len;
790 int status; 817 int status;
791 818
792 /* At this point we know there are no more dquots and thus
793 * even if there's some sync in the pdflush queue, it won't
794 * find any dquots and return without doing anything */
795 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
796 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
797 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
798 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -827,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
827 /* Mark local file as clean */ 850 /* Mark local file as clean */
828 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
829 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
830 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
831 olq_update_info, 854 olq_update_info,
832 info); 855 info);
833 if (status < 0) { 856 if (status < 0) {
@@ -837,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
837 860
838out: 861out:
839 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
840 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
841 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
842 kfree(oinfo); 865 kfree(oinfo);
843 return 0; 866 return 0;
@@ -865,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
865} 888}
866 889
867/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
868static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
869{ 892{
870 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
871 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
872 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
873 int status; 897 int status;
874 898
875 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
876 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
877 &bh);
878 if (status) { 901 if (status) {
879 mlog_errno(status); 902 mlog_errno(status);
880 goto out; 903 goto out;
881 } 904 }
882 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
883 olq_set_dquot, od);
884 if (status < 0) { 906 if (status < 0) {
885 mlog_errno(status); 907 mlog_errno(status);
886 goto out; 908 goto out;
@@ -980,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
980 } 1002 }
981 1003
982 /* Initialize chunk header */ 1004 /* Initialize chunk header */
983 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
984 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
985 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
986 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
987 if (status < 0) { 1007 if (status < 0) {
988 mlog_errno(status); 1008 mlog_errno(status);
989 goto out_trans; 1009 goto out_trans;
@@ -1008,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1008 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1009 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1010 unlock_buffer(bh); 1030 unlock_buffer(bh);
1011 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 goto out_trans;
1015 }
1016 1032
1017 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1018 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1019 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1020 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1021 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1022 if (status < 0) { 1036 if (status < 0) {
1023 mlog_errno(status); 1037 mlog_errno(status);
1024 goto out_trans; 1038 goto out_trans;
@@ -1039,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1039 lock_buffer(dbh); 1053 lock_buffer(dbh);
1040 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1041 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1042 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1043 if (status < 0) {
1044 mlog_errno(status);
1045 goto out_trans;
1046 }
1047 1057
1048 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1049 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1119,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1119 } 1129 }
1120 1130
1121 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1122 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1123 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1124 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1125 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1126 if (status < 0) { 1134 if (status < 0) {
1127 mlog_errno(status); 1135 mlog_errno(status);
1128 goto out; 1136 goto out;
@@ -1154,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1154 lock_buffer(bh); 1162 lock_buffer(bh);
1155 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1156 unlock_buffer(bh); 1164 unlock_buffer(bh);
1157 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1158 if (status < 0) { 1166
1159 mlog_errno(status);
1160 goto out_trans;
1161 }
1162 /* Update chunk header */ 1167 /* Update chunk header */
1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1172,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1172 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1173 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1174 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1175 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1176 if (status < 0) { 1181
1177 mlog_errno(status);
1178 goto out_trans;
1179 }
1180 /* Update file header */ 1182 /* Update file header */
1181 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1182 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1209,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1209} 1211}
1210 1212
1211/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1212static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1213{ 1215{
1214 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1215 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1218,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1218 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1219 int offset; 1221 int offset;
1220 int status; 1222 int status;
1223 u64 pcount;
1221 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1222 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1223 if (!chunk) { 1227 if (!chunk) {
1224 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1225 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1226 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1227 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1228 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1229 } 1236 }
1230 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1231 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1232 1244
1233 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1234 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1245,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1245 goto out; 1257 goto out;
1246 } 1258 }
1247out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1248 return status; 1261 return status;
1249} 1262}
1250 1263
1251/* Create entry in local file for dquot, load data from the global file */ 1264/*
1252static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1253{ 1266 * already started a transaction and written all changes to global quota file
1254 int status; 1267 */
1255 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1256 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1257
1258 status = ocfs2_global_read_dquot(dquot);
1259 if (status < 0) {
1260 mlog_errno(status);
1261 goto out_err;
1262 }
1263
1264 /* Now create entry in the local quota file */
1265 status = ocfs2_create_local_dquot(dquot);
1266 if (status < 0) {
1267 mlog_errno(status);
1268 goto out_err;
1269 }
1270 mlog_exit(0);
1271 return 0;
1272out_err:
1273 mlog_exit(status);
1274 return status;
1275}
1276
1277/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1278 * already started a transaction and obtained exclusive lock for global
1279 * quota file. */
1280static int ocfs2_local_release_dquot(struct dquot *dquot)
1281{ 1269{
1282 int status; 1270 int status;
1283 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1285,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1285 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1286 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1287 int offset; 1275 int offset;
1288 handle_t *handle = journal_current_handle();
1289
1290 BUG_ON(!handle);
1291 /* First write all local changes to global file */
1292 status = ocfs2_global_release_dquot(dquot);
1293 if (status < 0) {
1294 mlog_errno(status);
1295 goto out;
1296 }
1297 1276
1298 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1299 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1311,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1311 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1312 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1313 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1314 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1315 if (status < 0) { 1294
1316 mlog_errno(status);
1317 goto out;
1318 }
1319 status = 0;
1320out: 1295out:
1321 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1322 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1330,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1330 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1331 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1332 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1333 .read_dqblk = ocfs2_local_read_dquot,
1334 .commit_dqblk = ocfs2_local_write_dquot,
1335 .release_dqblk = ocfs2_local_release_dquot,
1336}; 1308};
1337 1309
1338struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -571,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
572 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
573 u32 num_got; 572 u32 num_got;
574 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
@@ -597,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
597 goto out_commit; 596 goto out_commit;
598 } 597 }
599 598
600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
602 &first_blkno); 601 &first_blkno);
603 if (ret) { 602 if (ret) {
@@ -627,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
627 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -791,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
791 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
792 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
793 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
794 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795 798
796 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
797 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1269,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1269 } else if (merge) 1272 } else if (merge)
1270 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1271 1274
1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1273 if (ret)
1274 mlog_errno(ret);
1275out: 1276out:
1276 return ret; 1277 return ret;
1277} 1278}
@@ -1285,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1285 int ret; 1286 int ret;
1286 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1287 u32 num_got; 1288 u32 num_got;
1288 u64 blkno; 1289 u64 suballoc_loc, blkno;
1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1290 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 1302
1302 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1303 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1304 &blkno); 1305 &blkno);
1305 if (ret) { 1306 if (ret) {
@@ -1331,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1331 1332
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1525,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1525 int ret; 1527 int ret;
1526 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1527 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1528 u64 blkno; 1530 u64 suballoc_loc, blkno;
1529 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1530 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1531 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1549 goto out; 1551 goto out;
1550 } 1552 }
1551 1553
1552 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1553 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1554 &blkno); 1556 &blkno);
1555 if (ret) { 1557 if (ret) {
@@ -1577,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1577 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1695,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1695 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
 1696 * the extent block that contains the extent rec. 1699 * the extent block that contains the extent rec.
1697 */ 1700 */
1698 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1699 if (ret < 0) { 1702 if (ret < 0) {
1700 mlog_errno(ret); 1703 mlog_errno(ret);
1701 goto out; 1704 goto out;
@@ -1803,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1803 if (merge) 1806 if (merge)
1804 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1805 1808
1806 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1807 if (ret) {
1808 mlog_errno(ret);
1809 goto out;
1810 }
1811 1810
1812 if (index == 0) { 1811 if (index == 0) {
1813 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1978,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1978 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1979 } 1978 }
1980 1979
1981 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1982 if (ret)
1983 mlog_errno(ret);
1984 1981
1985out: 1982out:
1986 brelse(new_bh); 1983 brelse(new_bh);
@@ -2113,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2113 */ 2110 */
2114 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2115 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2116 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2117 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2118 if (ret) { 2116 if (ret) {
@@ -2517,20 +2515,19 @@ out:
2517 * 2515 *
 2518 * Normally the refcount blocks storing these refcounts should be 2516 * Normally the refcount blocks storing these refcounts should be
 2519 * contiguous as well, so that we can get the number easily. 2517 * contiguous as well, so that we can get the number easily.
 2520 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
 2521 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2522 * 2520 *
2523 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2524 */ 2522 */
2525int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2526 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2527 u64 phys_blkno, 2525 u64 phys_blkno,
2528 u32 clusters, 2526 u32 clusters,
2529 int *credits, 2527 int *credits,
2530 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2531{ 2529{
2532 int ret, ref_blocks = 0; 2530 int ret;
2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2535 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2536 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2548 2545
2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2550 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2551 if (ret) { 2548 if (ret) {
2552 mlog_errno(ret); 2549 mlog_errno(ret);
2553 goto out; 2550 goto out;
2554 } 2551 }
2555 2552
2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2557 le64_to_cpu(di->i_refcount_loc),
2558 &ref_root_bh); 2554 &ref_root_bh);
2559 if (ret) { 2555 if (ret) {
2560 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2565,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2565 &tree->rf_ci, 2561 &tree->rf_ci,
2566 ref_root_bh, 2562 ref_root_bh,
2567 start_cpos, clusters, 2563 start_cpos, clusters,
2568 &ref_blocks, credits); 2564 ref_blocks, credits);
2569 if (ret) { 2565 if (ret) {
2570 mlog_errno(ret); 2566 mlog_errno(ret);
2571 goto out; 2567 goto out;
2572 } 2568 }
2573 2569
2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2575 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2576
2577 if (ref_blocks) {
2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2579 ref_blocks, meta_ac);
2580 if (ret)
2581 mlog_errno(ret);
2582 }
2583 2572
2584out: 2573out:
2585 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3041,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3041 } 3030 }
3042 3031
3043 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3044 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3045 if (ret) {
3046 mlog_errno(ret);
3047 break;
3048 }
3049 3034
3050 brelse(new_bh); 3035 brelse(new_bh);
3051 brelse(old_bh); 3036 brelse(old_bh);
@@ -3283,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3283 } else { 3268 } else {
3284 delete = 1; 3269 delete = 1;
3285 3270
3286 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3287 context->data_ac, 3272 context->data_ac,
3288 1, set_len, 3273 1, set_len,
3289 &new_bit, &new_len); 3274 &new_bit, &new_len);
@@ -4075,6 +4060,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4060 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4061 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4062 i_size_write(t_inode, size);
4063 t_inode->i_blocks = s_inode->i_blocks;
4078 4064
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4065 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4066 di->i_clusters = s_di->i_clusters;
@@ -4083,6 +4069,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4083 di->i_attr = s_di->i_attr; 4069 di->i_attr = s_di->i_attr;
4084 4070
4085 if (preserve) { 4071 if (preserve) {
4072 t_inode->i_uid = s_inode->i_uid;
4073 t_inode->i_gid = s_inode->i_gid;
4074 t_inode->i_mode = s_inode->i_mode;
4086 di->i_uid = s_di->i_uid; 4075 di->i_uid = s_di->i_uid;
4087 di->i_gid = s_di->i_gid; 4076 di->i_gid = s_di->i_gid;
4088 di->i_mode = s_di->i_mode; 4077 di->i_mode = s_di->i_mode;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
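A quick worked example of the window sizing (the default level is an assumption here, not stated in this file):

	/*
	 * With osb_resv_level == 2 (assumed default), regular files get a
	 * 4 << 2 == 16 bit window; level 1 yields the 8-bit minimum and
	 * level 8 the 1024-bit maximum, matching
	 * OCFS2_MIN_RESV_WINDOW_BITS and OCFS2_MAX_RESV_WINDOW_BITS above.
	 */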
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return (resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* Does nothing if 'resv' is NULL. */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
387 /* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
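/*
 * A user-space reduction (illustrative only, not ocfs2 API) of the
 * contract above, on a plain sorted array: return the window containing
 * 'goal', else the nearest window before it, else none (-1).
 */
#include <stdio.h>

struct win { unsigned int start, len; };

static int find_lhs(const struct win *w, int nr, unsigned int goal)
{
    int i, prev = -1;

    for (i = 0; i < nr; i++) {
        unsigned int end = w[i].start + w[i].len - 1;

        if (w[i].start <= goal && goal <= end)
            return i;    /* goal inside this window */
        if (w[i].start > goal)
            return prev; /* overshot: window just before goal */
        prev = i;
    }
    return prev;
}

int main(void)
{
    struct win w[] = { { 10, 5 }, { 30, 4 } };

    printf("%d %d %d\n",
           find_lhs(w, 2, 12),  /* 0: inside first window */
           find_lhs(w, 2, 20),  /* 0: first window precedes 20 */
           find_lhs(w, 2, 5));  /* -1: nothing before goal */
    return 0;
}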
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The value of *rstart on entry is ignored.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
414 * *rstart and *rlen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
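/*
 * A user-space reduction (illustrative only, not ocfs2 API) of the scan
 * above, over a plain byte array, so the run bookkeeping ('found'
 * current run vs. the best run seen) can be exercised in isolation.
 */
#include <stdio.h>

static int test_bit(const unsigned char *map, unsigned int bit)
{
    return (map[bit / 8] >> (bit % 8)) & 1;
}

/* Returns length found (<= wanted), 0 if no zero bits in range. */
static unsigned int find_free_bits(const unsigned char *map,
                                   unsigned int start, unsigned int len,
                                   unsigned int wanted,
                                   unsigned int *rstart)
{
    unsigned int best_start = 0, best_len = 0, found = 0;
    unsigned int bit;

    for (bit = start; bit < start + len; bit++) {
        if (test_bit(map, bit)) {
            found = 0;  /* run broken by an allocated bit */
            continue;
        }
        found++;
        if (found > best_len) {
            best_len = found;
            best_start = bit - found + 1;
        }
        if (found >= wanted)
            break;
    }

    if (best_len)
        *rstart = best_start;
    return best_len;
}

int main(void)
{
    /* 0x99 = 10011001b: free runs at bits 1-2 and 5-6 */
    unsigned char map[1] = { 0x99 };
    unsigned int where = 0;
    unsigned int got = find_free_bits(map, 0, 8, 2, &where);

    printf("got %u bits at %u\n", got, where); /* got 2 bits at 1 */
    return 0;
}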
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
535 * which is also by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
550 * The search should never return such a window. (see
551 * comment above
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
576 /* Now we do a linear search for a window, starting at 'prev_rsv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
646 min_bits = wanted; /* We know the temp window will use
647 * all of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
668 * reasonable threshold - 50% of window bits for non-temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
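/*
 * A user-space sketch (illustrative only) of the two cannibalization
 * outcomes above, assuming a non-temporary window whose
 * ocfs2_resv_window_bits() is 16, i.e. a min_bits threshold of 8.
 */
#include <stdio.h>

int main(void)
{
    unsigned int min_bits = 16 >> 1;            /* 50% threshold */
    unsigned int lru_start = 100, lru_len = 20; /* victim: 100..119 */

    if (lru_len <= min_bits) {
        /* small victim: steal the whole window and discard it */
        printf("take all: start %u len %u\n", lru_start, lru_len);
    } else {
        /* big victim: it keeps the left half, we take the right */
        unsigned int shrink = lru_len / 2;
        unsigned int keep = lru_len - shrink;

        printf("victim keeps: start %u len %u\n", lru_start, keep);
        printf("we get:       start %u len %u\n",
               lru_start + keep, shrink);       /* start 110 len 10 */
    }
    return 0;
}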
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull the oldest one off the LRU, remove it
724 * from the tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
757 * through and test the bitmap. This avoids some
758 * ping-ponging of windows due to non-reserved space
759 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
773
774static void
775ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
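/*
 * A user-space sketch (illustrative only) of the arithmetic above:
 * a window 100..109 from which bits 100..103 were claimed shrinks to
 * 104..109; a fully-used window is discarded instead.
 */
#include <stdio.h>

int main(void)
{
    unsigned int r_start = 100, r_len = 10; /* window 100..109 */
    unsigned int claim_end = 103;           /* claimed 100..103 */
    unsigned int old_end = r_start + r_len - 1;

    if (old_end == claim_end) {
        printf("window fully used - discard\n");
    } else {
        r_start = claim_end + 1;
        r_len = old_end - r_start + 1;
        printf("window now %u..%u (len %u)\n",
               r_start, old_end, r_len);    /* 104..109 (len 6) */
    }
    return 0;
}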
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
833 * May have been discarded above from
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of an rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
74 * @resmap: reservation map that @resv belongs to
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
86 * @osb: struct ocfs2_super of the mounted filesystem
87 * @resmap: struct ocfs2_reservation_map to initialize
88 *
89 * Prepares @resmap for use. A disk bitmap must still be provided
90 * via ocfs2_resmap_restart() before reservations can be made
91 * against it. Currently '0' is the only possible return value;
92 * the int return is kept for future expansion.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
106 * This function truncates and discards all existing
107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: set to the start of the usable window on success
124 * @clen: on entry, the length wanted; on success, the usable length
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
151 * reservations bitmap. If resv is passed, its next allocation window
152 * will be calculated. 'cstart' must be the same value that was passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
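/*
 * A caller-side sketch of the resv_bits/claimed_bits cycle documented
 * above (illustrative only, not part of the patch; the helper name is
 * hypothetical, error handling is trimmed, and the disk-bitmap update
 * is elided).
 */
static int example_resv_cycle(struct ocfs2_reservation_map *resmap,
                              struct ocfs2_alloc_reservation *resv,
                              u32 wanted, u32 *bit_off, u32 *num_bits)
{
    int cstart, clen = wanted; /* *clen carries the request in */
    int ret;

    ret = ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen);
    if (ret == -ENOSPC)
        return ret; /* reservations disabled - search the bitmap directly */

    *bit_off = cstart;
    *num_bits = (u32)clen < wanted ? (u32)clen : wanted;

    /* ... set the claimed bits in resmap->m_disk_bitmap here ... */

    /*
     * Always report the claim; cstart must be the value that
     * ocfs2_resmap_resv_bits() handed back.
     */
    ocfs2_resmap_claimed_bits(resmap, resv, *bit_off, *num_bits);
    return 0;
}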
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -95,13 +102,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 102 struct buffer_head *group_bh,
96 unsigned int bit_off, 103 unsigned int bit_off,
97 unsigned int num_bits); 104 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 105static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 106 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 107 struct buffer_head *fe_bh,
@@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 } 137 }
138 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
140} 141}
141 142
142void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 153
153#define do_error(fmt, ...) \ 154#define do_error(fmt, ...) \
154 do{ \ 155 do{ \
155 if (clean_error) \ 156 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 157 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 158 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 159 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +161,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 161
161static int ocfs2_validate_gd_self(struct super_block *sb, 162static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 163 struct buffer_head *bh,
163 int clean_error) 164 int resize)
164{ 165{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 166 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 167
@@ -211,7 +212,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 212static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 213 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 214 struct buffer_head *bh,
214 int clean_error) 215 int resize)
215{ 216{
216 unsigned int max_bits; 217 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 218 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +234,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 if (le16_to_cpu(gd->bg_chain) >= 237 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 238 if ((le16_to_cpu(gd->bg_chain) >
239 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240 ((le16_to_cpu(gd->bg_chain) ==
241 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 242 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 243 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 244 le16_to_cpu(gd->bg_chain));
@@ -329,14 +333,38 @@ out:
329 return rc; 333 return rc;
330} 334}
331 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le16(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
357
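/*
 * A user-space sketch (illustrative only) of the extent bookkeeping
 * above, assuming 4 bits per cluster (cl_bpc) and a group already
 * covering 8 clusters (bg_bits == 32) when a 4-cluster extent is
 * appended.
 */
#include <stdio.h>

int main(void)
{
    unsigned int bpc = 4, bg_bits = 32, clusters = 4;

    /* e_cpos: cluster offset of the new extent inside the group */
    printf("e_cpos  = %u\n", bg_bits / bpc);    /* 8 */

    bg_bits += clusters * bpc;                  /* bg_bits grows, as does
                                                 * bg_free_bits_count */
    printf("bg_bits = %u\n", bg_bits);          /* 48 */
    return 0;
}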
332static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
333 struct inode *alloc_inode, 359 struct inode *alloc_inode,
334 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
335 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
336 u16 my_chain, 363 u16 my_chain,
337 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
338{ 365{
339 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
340 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
341 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
342 370
@@ -363,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
363 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
364 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
365 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
366 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
367 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
368 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
369 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
370 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
371 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
372 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
373 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
374 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
375 409
376 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
377 if (status < 0)
378 mlog_errno(status);
379 411
380 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
381 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -401,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
401 return best; 433 return best;
402} 434}
403 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
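/*
 * A user-space sketch (illustrative only) of the back-off above: halve
 * the request until a claim succeeds or min_bits reaches zero. The
 * try_claim() stand-in is hypothetical - here only requests of four or
 * fewer clusters succeed.
 */
#include <errno.h>
#include <stdio.h>

static int try_claim(unsigned int bits)
{
    return bits <= 4 ? 0 : -ENOSPC;
}

int main(void)
{
    unsigned int min_bits = 16;
    int status = -ENOSPC;

    while (min_bits) {
        status = try_claim(min_bits);
        if (status != -ENOSPC)
            break;
        min_bits >>= 1; /* back off: 16, 8, 4, ... */
    }
    printf("status %d, claimed at %u bits\n", status, min_bits);
    return 0;
}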
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
550 * We have used up all the extent records but still can't
551 * fill up the cluster group (cl_cpg clusters), so bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
586 /* Try to free all the clusters even if one fails */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
404/* 668/*
405 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
406 */ 670 */
@@ -416,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
416 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
417 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
418 handle_t *handle = NULL; 682 handle_t *handle = NULL;
419 u32 bit_off, num_bits;
420 u16 alloc_rec; 683 u16 alloc_rec;
421 u64 bg_blkno;
422 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
423 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
424 686
@@ -451,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
451 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
453 } 715 }
454 status = ocfs2_claim_clusters(osb, 716
455 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
456 ac, 718 ac, cl);
457 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
458 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
459 &num_bits); 721 alloc_inode,
460 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
461 if (status != -ENOSPC) 726 if (status != -ENOSPC)
462 mlog_errno(status); 727 mlog_errno(status);
463 goto bail; 728 goto bail;
464 } 729 }
465
466 alloc_rec = ocfs2_find_smallest_chain(cl);
467
468 /* setup the group */
469 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
470 mlog(0, "new descriptor, record %u, at block %llu\n",
471 alloc_rec, (unsigned long long)bg_blkno);
472
473 bg_bh = sb_getblk(osb->sb, bg_blkno);
474 if (!bg_bh) {
475 status = -EIO;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480
481 status = ocfs2_block_group_fill(handle,
482 alloc_inode,
483 bg_bh,
484 bg_blkno,
485 alloc_rec,
486 cl);
487 if (status < 0) {
488 mlog_errno(status);
489 goto bail;
490 }
491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 731
494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
498 goto bail; 736 goto bail;
499 } 737 }
500 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
501 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
502 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
503 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
504 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
505 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
506 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
507 747
@@ -510,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
510 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
511 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
512 752
513 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail;
517 }
518 754
519 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
520 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -764,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
764 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
765 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
766 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
768 1004
769 1005
770 if (status >= 0) { 1006 if (status >= 0) {
@@ -950,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
950 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
951 bits_wanted, 1187 bits_wanted,
952 *ac); 1188 *ac);
953 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
954 /* The local alloc window is outside ac_max_block.
955 * use the main bitmap. */
956 status = -ENOSPC;
957 } else if ((status < 0) && (status != -ENOSPC)) {
958 mlog_errno(status); 1190 mlog_errno(status);
959 goto bail; 1191 goto bail;
960 } 1192 }
@@ -1037,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1037 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1038 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1039 unsigned int total_bits, 1271 unsigned int total_bits,
1040 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1041 u16 *bits_found)
1042{ 1273{
1043 void *bitmap; 1274 void *bitmap;
1044 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1082,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1082 } 1313 }
1083 } 1314 }
1084 1315
1085 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1086 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1087 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1088 *bit_off = start - found;
1089 *bits_found = found;
1090 } else if (best_size) {
1091 *bit_off = best_offset;
1092 *bits_found = best_size;
1093 } else { 1319 } else {
1094 status = -ENOSPC; 1320 status = -ENOSPC;
1095 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1133,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1133 } 1359 }
1134 1360
1135 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1136
1137 while(num_bits--) 1362 while(num_bits--)
1138 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1139 1364
1140 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1141 group_bh);
1142 if (status < 0) {
1143 mlog_errno(status);
1144 goto bail;
1145 }
1146 1366
1147bail: 1367bail:
1148 mlog_exit(status); 1368 mlog_exit(status);
@@ -1206,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1206 } 1426 }
1207 1427
1208 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1209 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1210 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto out_rollback;
1214 }
1215 1430
1216 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1217 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1221,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1221 } 1436 }
1222 1437
1223 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1224 1439 ocfs2_journal_dirty(handle, bg_bh);
1225 status = ocfs2_journal_dirty(handle, bg_bh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out_rollback;
1229 }
1230 1440
1231 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1232 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1236,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1236 } 1446 }
1237 1447
1238 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1239 1450
1240 status = ocfs2_journal_dirty(handle, fe_bh);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto out_rollback;
1244 }
1245
1246 status = 0;
1247out_rollback: 1451out_rollback:
1248 if (status < 0) { 1452 if (status < 0) {
1249 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1267,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1267 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1268 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1269 u64 max_block, 1473 u64 max_block,
1270 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1271{ 1475{
1272 int search = -ENOSPC; 1476 int search = -ENOSPC;
1273 int ret; 1477 int ret;
1274 u64 blkoff; 1478 u64 blkoff;
1275 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u16 tmp_off, tmp_found;
1278 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1279 1482
1280 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1301,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1301 1504
1302 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1303 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1304 max_bits, 1507 max_bits, res);
1305 &tmp_off, &tmp_found);
1306 if (ret) 1508 if (ret)
1307 return ret; 1509 return ret;
1308 1510
1309 if (max_block) { 1511 if (max_block) {
1310 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1311 gd_cluster_off + 1513 gd_cluster_off +
1312 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1313 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1314 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1315 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1321,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1321 * return success, but we still want to return 1524 * return success, but we still want to return
1322 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1323 * of bits. */ 1526 * of bits. */
1324 if (min_bits <= tmp_found) { 1527 if (min_bits <= res->sr_bits)
1325 *bit_off = tmp_off;
1326 *bits_found = tmp_found;
1327 search = 0; /* success */ 1528 search = 0; /* success */
1328 } else if (tmp_found) { 1529 else if (res->sr_bits) {
1329 /* 1530 /*
1330 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1331 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1332 */ 1533 */
1333 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1534 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1334 } 1535 }
1335 } 1536 }
1336 1537
@@ -1341,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1341 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1342 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1343 u64 max_block, 1544 u64 max_block,
1344 u16 *bit_off, u16 *bits_found) 1545 struct ocfs2_suballoc_result *res)
1345{ 1546{
1346 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1347 u64 blkoff; 1548 u64 blkoff;
@@ -1354,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1354 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1355 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1356 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1357 bit_off, bits_found); 1558 res);
1358 if (!ret && max_block) { 1559 if (!ret && max_block) {
1359 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1560 blkoff = le64_to_cpu(bg->bg_blkno) +
1360 *bits_found; 1561 res->sr_bit_offset + res->sr_bits;
1361 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1362 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1363 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1390,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1390 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1391 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1392 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1393 1594 ocfs2_journal_dirty(handle, di_bh);
1394 ret = ocfs2_journal_dirty(handle, di_bh);
1395 if (ret < 0)
1396 mlog_errno(ret);
1397 1595
1398out: 1596out:
1399 return ret; 1597 return ret;
1400} 1598}
1401 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
1648
1402static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1403 handle_t *handle, 1650 handle_t *handle,
1404 u32 bits_wanted, 1651 u32 bits_wanted,
1405 u32 min_bits, 1652 u32 min_bits,
1406 u16 *bit_off, 1653 struct ocfs2_suballoc_result *res,
1407 unsigned int *num_bits,
1408 u64 gd_blkno,
1409 u16 *bits_left) 1654 u16 *bits_left)
1410{ 1655{
1411 int ret; 1656 int ret;
1412 u16 found;
1413 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1414 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1415 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1416 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1417 1661
1418 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1662 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1419 &group_bh); 1663 res->sr_bg_blkno, &group_bh);
1420 if (ret < 0) { 1664 if (ret < 0) {
1421 mlog_errno(ret); 1665 mlog_errno(ret);
1422 return ret; 1666 return ret;
@@ -1424,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1424 1668
1425 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1426 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1427 ac->ac_max_block, bit_off, &found); 1671 ac->ac_max_block, res);
1428 if (ret < 0) { 1672 if (ret < 0) {
1429 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1430 mlog_errno(ret); 1674 mlog_errno(ret);
1431 goto out; 1675 goto out;
1432 } 1676 }
1433 1677
1434 *num_bits = found; 1678 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res);
1435 1680
1436 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1437 *num_bits, 1682 res->sr_bits,
1438 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1439 if (ret < 0) { 1684 if (ret < 0) {
1440 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1442,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1442 } 1687 }
1443 1688
1444 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1445 *bit_off, *num_bits); 1690 res->sr_bit_offset, res->sr_bits);
1446 if (ret < 0) 1691 if (ret < 0)
1447 mlog_errno(ret); 1692 mlog_errno(ret);
1448 1693
@@ -1458,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1458 handle_t *handle, 1703 handle_t *handle,
1459 u32 bits_wanted, 1704 u32 bits_wanted,
1460 u32 min_bits, 1705 u32 min_bits,
1461 u16 *bit_off, 1706 struct ocfs2_suballoc_result *res,
1462 unsigned int *num_bits,
1463 u64 *bg_blkno,
1464 u16 *bits_left) 1707 u16 *bits_left)
1465{ 1708{
1466 int status; 1709 int status;
1467 u16 chain, tmp_bits; 1710 u16 chain;
1468 u32 tmp_used; 1711 u32 tmp_used;
1469 u64 next_group; 1712 u64 next_group;
1470 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1493,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1493 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1494 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1495 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1496 ac->ac_max_block, bit_off, 1739 ac->ac_max_block,
1497 &tmp_bits)) == -ENOSPC) { 1740 res)) == -ENOSPC) {
1498 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1499 break; 1742 break;
1500 1743
@@ -1519,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1519 } 1762 }
1520 1763
1521 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1522 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1765 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1523 1766
1524 *num_bits = tmp_bits; 1767 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768
1769 BUG_ON(res->sr_bits == 0);
1770 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res);
1525 1772
1526 BUG_ON(*num_bits == 0);
1527 1773
1528 /* 1774 /*
1529 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1540,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1540 */ 1786 */
1541 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1542 (prev_group_bh) && 1788 (prev_group_bh) &&
1543 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1789 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1544 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1545 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1546 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1562,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1562 } 1808 }
1563 1809
1564 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1565 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1566 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1567 1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1568 status = ocfs2_journal_dirty(handle,
1569 ac->ac_bh);
1570 if (status < 0) {
1571 mlog_errno(status);
1572 goto bail;
1573 }
1574 1814
1575 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1576 alloc_inode, 1816 alloc_inode,
1577 bg, 1817 bg,
1578 group_bh, 1818 group_bh,
1579 *bit_off, 1819 res->sr_bit_offset,
1580 *num_bits); 1820 res->sr_bits);
1581 if (status < 0) { 1821 if (status < 0) {
1582 mlog_errno(status); 1822 mlog_errno(status);
1583 goto bail; 1823 goto bail;
1584 } 1824 }
1585 1825
1586 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1587 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1588 1828
1589 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1590 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1591bail: 1830bail:
1592 brelse(group_bh); 1831 brelse(group_bh);
@@ -1597,19 +1836,15 @@ bail:
1597} 1836}
1598 1837
1599/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1600static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1839static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1601 struct ocfs2_alloc_context *ac,
1602 handle_t *handle, 1840 handle_t *handle,
1603 u32 bits_wanted, 1841 u32 bits_wanted,
1604 u32 min_bits, 1842 u32 min_bits,
1605 u16 *bit_off, 1843 struct ocfs2_suballoc_result *res)
1606 unsigned int *num_bits,
1607 u64 *bg_blkno)
1608{ 1844{
1609 int status; 1845 int status;
1610 u16 victim, i; 1846 u16 victim, i;
1611 u16 bits_left = 0; 1847 u16 bits_left = 0;
1612 u64 hint_blkno = ac->ac_last_group;
1613 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1614 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1615 1850
@@ -1627,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1627 1862
1628 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1629 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1630 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1865 ocfs2_error(ac->ac_inode->i_sb,
1866 "Chain allocator dinode %llu has %u used "
1631 "bits but only %u total.", 1867 "bits but only %u total.",
1632 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1633 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1636,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1636 goto bail; 1872 goto bail;
1637 } 1873 }
1638 1874
1639 if (hint_blkno) { 1875 res->sr_bg_blkno = ac->ac_last_group;
1876 if (res->sr_bg_blkno) {
1640 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1641 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1642 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1643 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1644 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1645 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1646 hint_blkno, &bits_left); 1883 if (!status)
1647 if (!status) {
1648 /* Be careful to update *bg_blkno here as the
1649 * caller is expecting it to be filled in, and
1650 * ocfs2_search_one_group() won't do that for
1651 * us. */
1652 *bg_blkno = hint_blkno;
1653 goto set_hint; 1884 goto set_hint;
1654 }
1655 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1656 mlog_errno(status); 1886 mlog_errno(status);
1657 goto bail; 1887 goto bail;
@@ -1664,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1664 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1665 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1666 1896
1667 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1668 num_bits, bg_blkno, &bits_left); 1898 res, &bits_left);
1669 if (!status) 1899 if (!status)
1670 goto set_hint; 1900 goto set_hint;
1671 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1689,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1689 1919
1690 ac->ac_chain = i; 1920 ac->ac_chain = i;
1691 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1692 bit_off, num_bits, bg_blkno, 1922 res, &bits_left);
1693 &bits_left);
1694 if (!status) 1923 if (!status)
1695 break; 1924 break;
1696 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1707,7 +1936,7 @@ set_hint:
1707 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1708 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1709 else 1938 else
1710 ac->ac_last_group = *bg_blkno; 1939 ac->ac_last_group = res->sr_bg_blkno;
1711 } 1940 }
1712 1941
1713bail: 1942bail:
@@ -1715,37 +1944,37 @@ bail:
1715 return status; 1944 return status;
1716} 1945}
1717 1946
1718int ocfs2_claim_metadata(struct ocfs2_super *osb, 1947int ocfs2_claim_metadata(handle_t *handle,
1719 handle_t *handle,
1720 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1721 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1722 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1723 unsigned int *num_bits, 1952 unsigned int *num_bits,
1724 u64 *blkno_start) 1953 u64 *blkno_start)
1725{ 1954{
1726 int status; 1955 int status;
1727 u64 bg_blkno; 1956 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1728 1957
1729 BUG_ON(!ac); 1958 BUG_ON(!ac);
1730 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1731 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1732 1961
1733 status = ocfs2_claim_suballoc_bits(osb, 1962 status = ocfs2_claim_suballoc_bits(ac,
1734 ac,
1735 handle, 1963 handle,
1736 bits_wanted, 1964 bits_wanted,
1737 1, 1965 1,
1738 suballoc_bit_start, 1966 &res);
1739 num_bits,
1740 &bg_blkno);
1741 if (status < 0) { 1967 if (status < 0) {
1742 mlog_errno(status); 1968 mlog_errno(status);
1743 goto bail; 1969 goto bail;
1744 } 1970 }
1745 atomic_inc(&osb->alloc_stats.bg_allocs); 1971 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1746 1972
1747 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1973 *suballoc_loc = res.sr_bg_blkno;
1748 ac->ac_bits_given += (*num_bits); 1974 *suballoc_bit_start = res.sr_bit_offset;
1975 *blkno_start = res.sr_blkno;
1976 ac->ac_bits_given += res.sr_bits;
1977 *num_bits = res.sr_bits;
1749 status = 0; 1978 status = 0;
1750bail: 1979bail:
1751 mlog_exit(status); 1980 mlog_exit(status);
@@ -1753,10 +1982,10 @@ bail:
1753} 1982}
1754 1983
1755static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1756 struct buffer_head *parent_fe_bh, 1985 struct buffer_head *parent_di_bh,
1757 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1758{ 1987{
1759 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1760 /* 1989 /*
1761 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1762 * 1991 *
@@ -1770,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1770 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1771 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1772 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1773 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2002 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1774 ac->ac_last_group = ocfs2_which_suballoc_group( 2003 if (di->i_suballoc_loc)
1775 le64_to_cpu(fe->i_blkno), 2004 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1776 le16_to_cpu(fe->i_suballoc_bit)); 2005 else
2006 ac->ac_last_group = ocfs2_which_suballoc_group(
2007 le64_to_cpu(di->i_blkno),
2008 le16_to_cpu(di->i_suballoc_bit));
2009 }
1777} 2010}
1778 2011
1779static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1783,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1783 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1784} 2017}
1785 2018
1786int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2019int ocfs2_claim_new_inode(handle_t *handle,
1787 handle_t *handle,
1788 struct inode *dir, 2020 struct inode *dir,
1789 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1790 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1791 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1792 u64 *fe_blkno) 2025 u64 *fe_blkno)
1793{ 2026{
1794 int status; 2027 int status;
1795 unsigned int num_bits; 2028 struct ocfs2_suballoc_result res;
1796 u64 bg_blkno;
1797 2029
1798 mlog_entry_void(); 2030 mlog_entry_void();
1799 2031
@@ -1804,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1804 2036
1805 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1806 2038
1807 status = ocfs2_claim_suballoc_bits(osb, 2039 status = ocfs2_claim_suballoc_bits(ac,
1808 ac,
1809 handle, 2040 handle,
1810 1, 2041 1,
1811 1, 2042 1,
1812 suballoc_bit, 2043 &res);
1813 &num_bits,
1814 &bg_blkno);
1815 if (status < 0) { 2044 if (status < 0) {
1816 mlog_errno(status); 2045 mlog_errno(status);
1817 goto bail; 2046 goto bail;
1818 } 2047 }
1819 atomic_inc(&osb->alloc_stats.bg_allocs); 2048 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1820 2049
1821 BUG_ON(num_bits != 1); 2050 BUG_ON(res.sr_bits != 1);
1822 2051
1823 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2052 *suballoc_loc = res.sr_bg_blkno;
2053 *suballoc_bit = res.sr_bit_offset;
2054 *fe_blkno = res.sr_blkno;
1824 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1825 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1826 status = 0; 2057 status = 0;
@@ -1890,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1890 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1891 * of any size. 2122 * of any size.
1892 */ 2123 */
1893int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2124int __ocfs2_claim_clusters(handle_t *handle,
1894 handle_t *handle,
1895 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1896 u32 min_clusters, 2126 u32 min_clusters,
1897 u32 max_clusters, 2127 u32 max_clusters,
@@ -1900,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1900{ 2130{
1901 int status; 2131 int status;
1902 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1903 u64 bg_blkno = 0; 2133 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1904 u16 bg_bit_off; 2134 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1905 2135
1906 mlog_entry_void(); 2136 mlog_entry_void();
1907 2137
@@ -1911,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1911 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1912 2142
1913 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1914 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1915 handle, 2147 handle,
1916 ac, 2148 ac,
@@ -1933,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1933 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1934 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1935 2167
1936 status = ocfs2_claim_suballoc_bits(osb, 2168 status = ocfs2_claim_suballoc_bits(ac,
1937 ac,
1938 handle, 2169 handle,
1939 bits_wanted, 2170 bits_wanted,
1940 min_clusters, 2171 min_clusters,
1941 &bg_bit_off, 2172 &res);
1942 num_clusters,
1943 &bg_blkno);
1944 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1945 *cluster_start = 2175 *cluster_start =
1946 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1947 bg_blkno, 2177 res.sr_bg_blkno,
1948 bg_bit_off); 2178 res.sr_bit_offset);
1949 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1950 } 2181 }
1951 } 2182 }
1952 if (status < 0) { 2183 if (status < 0) {
@@ -1962,8 +2193,7 @@ bail:
1962 return status; 2193 return status;
1963} 2194}
1964 2195
1965int ocfs2_claim_clusters(struct ocfs2_super *osb, 2196int ocfs2_claim_clusters(handle_t *handle,
1966 handle_t *handle,
1967 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1968 u32 min_clusters, 2198 u32 min_clusters,
1969 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1971,22 +2201,22 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1971{ 2201{
1972 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1973 2203
1974 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2204 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1975 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1976} 2206}
1977 2207
1978static inline int ocfs2_block_group_clear_bits(handle_t *handle, 2208static int ocfs2_block_group_clear_bits(handle_t *handle,
1979 struct inode *alloc_inode, 2209 struct inode *alloc_inode,
1980 struct ocfs2_group_desc *bg, 2210 struct ocfs2_group_desc *bg,
1981 struct buffer_head *group_bh, 2211 struct buffer_head *group_bh,
1982 unsigned int bit_off, 2212 unsigned int bit_off,
1983 unsigned int num_bits) 2213 unsigned int num_bits,
2214 void (*undo_fn)(unsigned int bit,
2215 unsigned long *bmap))
1984{ 2216{
1985 int status; 2217 int status;
1986 unsigned int tmp; 2218 unsigned int tmp;
1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1988 struct ocfs2_group_desc *undo_bg = NULL; 2219 struct ocfs2_group_desc *undo_bg = NULL;
1989 int cluster_bitmap = 0;
1990 2220
1991 mlog_entry_void(); 2221 mlog_entry_void();
1992 2222
@@ -1996,20 +2226,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1996 2226
1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2227 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1998 2228
1999 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2229 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2000 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
2001
2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2230 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2003 group_bh, journal_type); 2231 group_bh,
2232 undo_fn ?
2233 OCFS2_JOURNAL_ACCESS_UNDO :
2234 OCFS2_JOURNAL_ACCESS_WRITE);
2004 if (status < 0) { 2235 if (status < 0) {
2005 mlog_errno(status); 2236 mlog_errno(status);
2006 goto bail; 2237 goto bail;
2007 } 2238 }
2008 2239
2009 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2240 if (undo_fn) {
2010 cluster_bitmap = 1;
2011
2012 if (cluster_bitmap) {
2013 jbd_lock_bh_state(group_bh); 2241 jbd_lock_bh_state(group_bh);
2014 undo_bg = (struct ocfs2_group_desc *) 2242 undo_bg = (struct ocfs2_group_desc *)
2015 bh2jh(group_bh)->b_committed_data; 2243 bh2jh(group_bh)->b_committed_data;
@@ -2020,18 +2248,16 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2020 while(tmp--) { 2248 while(tmp--) {
2021 ocfs2_clear_bit((bit_off + tmp), 2249 ocfs2_clear_bit((bit_off + tmp),
2022 (unsigned long *) bg->bg_bitmap); 2250 (unsigned long *) bg->bg_bitmap);
2023 if (cluster_bitmap) 2251 if (undo_fn)
2024 ocfs2_set_bit(bit_off + tmp, 2252 undo_fn(bit_off + tmp,
2025 (unsigned long *) undo_bg->bg_bitmap); 2253 (unsigned long *) undo_bg->bg_bitmap);
2026 } 2254 }
2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2028 2256
2029 if (cluster_bitmap) 2257 if (undo_fn)
2030 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2031 2259
2032 status = ocfs2_journal_dirty(handle, group_bh); 2260 ocfs2_journal_dirty(handle, group_bh);
2033 if (status < 0)
2034 mlog_errno(status);
2035bail: 2261bail:
2036 return status; 2262 return status;
2037} 2263}
@@ -2039,12 +2265,14 @@ bail:
2039/* 2265/*
2040 * expects the suballoc inode to already be locked. 2266 * expects the suballoc inode to already be locked.
2041 */ 2267 */
2042int ocfs2_free_suballoc_bits(handle_t *handle, 2268static int _ocfs2_free_suballoc_bits(handle_t *handle,
2043 struct inode *alloc_inode, 2269 struct inode *alloc_inode,
2044 struct buffer_head *alloc_bh, 2270 struct buffer_head *alloc_bh,
2045 unsigned int start_bit, 2271 unsigned int start_bit,
2046 u64 bg_blkno, 2272 u64 bg_blkno,
2047 unsigned int count) 2273 unsigned int count,
2274 void (*undo_fn)(unsigned int bit,
2275 unsigned long *bitmap))
2048{ 2276{
2049 int status = 0; 2277 int status = 0;
2050 u32 tmp_used; 2278 u32 tmp_used;
@@ -2079,7 +2307,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2079 2307
2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2308 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2081 group, group_bh, 2309 group, group_bh,
2082 start_bit, count); 2310 start_bit, count, undo_fn);
2083 if (status < 0) { 2311 if (status < 0) {
2084 mlog_errno(status); 2312 mlog_errno(status);
2085 goto bail; 2313 goto bail;
@@ -2096,12 +2324,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2096 count); 2324 count);
2097 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2098 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2099 2327 ocfs2_journal_dirty(handle, alloc_bh);
2100 status = ocfs2_journal_dirty(handle, alloc_bh);
2101 if (status < 0) {
2102 mlog_errno(status);
2103 goto bail;
2104 }
2105 2328
2106bail: 2329bail:
2107 brelse(group_bh); 2330 brelse(group_bh);
@@ -2110,6 +2333,17 @@ bail:
2110 return status; 2333 return status;
2111} 2334}
2112 2335
2336int ocfs2_free_suballoc_bits(handle_t *handle,
2337 struct inode *alloc_inode,
2338 struct buffer_head *alloc_bh,
2339 unsigned int start_bit,
2340 u64 bg_blkno,
2341 unsigned int count)
2342{
2343 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2344 start_bit, bg_blkno, count, NULL);
2345}
2346
2113int ocfs2_free_dinode(handle_t *handle, 2347int ocfs2_free_dinode(handle_t *handle,
2114 struct inode *inode_alloc_inode, 2348 struct inode *inode_alloc_inode,
2115 struct buffer_head *inode_alloc_bh, 2349 struct buffer_head *inode_alloc_bh,
@@ -2119,15 +2353,19 @@ int ocfs2_free_dinode(handle_t *handle,
2119 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2120 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2121 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2122 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2123 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2124} 2360}
2125 2361
2126int ocfs2_free_clusters(handle_t *handle, 2362static int _ocfs2_free_clusters(handle_t *handle,
2127 struct inode *bitmap_inode, 2363 struct inode *bitmap_inode,
2128 struct buffer_head *bitmap_bh, 2364 struct buffer_head *bitmap_bh,
2129 u64 start_blk, 2365 u64 start_blk,
2130 unsigned int num_clusters) 2366 unsigned int num_clusters,
2367 void (*undo_fn)(unsigned int bit,
2368 unsigned long *bitmap))
2131{ 2369{
2132 int status; 2370 int status;
2133 u16 bg_start_bit; 2371 u16 bg_start_bit;
@@ -2154,9 +2392,9 @@ int ocfs2_free_clusters(handle_t *handle,
2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2392 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2155 (unsigned long long)bg_blkno, bg_start_bit); 2393 (unsigned long long)bg_blkno, bg_start_bit);
2156 2394
2157 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2395 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2158 bg_start_bit, bg_blkno, 2396 bg_start_bit, bg_blkno,
2159 num_clusters); 2397 num_clusters, undo_fn);
2160 if (status < 0) { 2398 if (status < 0) {
2161 mlog_errno(status); 2399 mlog_errno(status);
2162 goto out; 2400 goto out;
@@ -2170,6 +2408,32 @@ out:
2170 return status; 2408 return status;
2171} 2409}
2172 2410
2411int ocfs2_free_clusters(handle_t *handle,
2412 struct inode *bitmap_inode,
2413 struct buffer_head *bitmap_bh,
2414 u64 start_blk,
2415 unsigned int num_clusters)
2416{
2417 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2418 start_blk, num_clusters,
2419 _ocfs2_set_bit);
2420}
2421
2422/*
2423 * Give never-used clusters back to the global bitmap. We don't need
2424 * to protect these bits in the undo buffer.
2425 */
2426int ocfs2_release_clusters(handle_t *handle,
2427 struct inode *bitmap_inode,
2428 struct buffer_head *bitmap_bh,
2429 u64 start_blk,
2430 unsigned int num_clusters)
2431{
2432 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2433 start_blk, num_clusters,
2434 _ocfs2_clear_bit);
2435}
2436
2173static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2437static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2174{ 2438{
2175 printk("Block Group:\n"); 2439 printk("Block Group:\n");
@@ -2360,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2360 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2361 u16 bit, int *res) 2625 u16 bit, int *res)
2362{ 2626{
2363 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2364 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2365 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2366 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2369,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2369 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2370 (unsigned int)bit); 2634 (unsigned int)bit);
2371 2635
2372 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2636 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2373 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2637 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2374 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2375 (unsigned int)bit, 2639 (unsigned int)bit,
2376 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2640 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2377 status = -EINVAL; 2641 status = -EINVAL;
2378 goto bail; 2642 goto bail;
2379 } 2643 }
2380 2644
2381 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2645 if (alloc_di->i_suballoc_loc)
2382 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2383 &group_bh); 2650 &group_bh);
2384 if (status < 0) { 2651 if (status < 0) {
2385 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 86int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 93int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 100int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 109int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
@@ -127,6 +128,11 @@ int ocfs2_free_clusters(handle_t *handle,
127 struct buffer_head *bitmap_bh, 128 struct buffer_head *bitmap_bh,
128 u64 start_blk, 129 u64 start_blk,
129 unsigned int num_clusters); 130 unsigned int num_clusters);
131int ocfs2_release_clusters(handle_t *handle,
132 struct inode *bitmap_inode,
133 struct buffer_head *bitmap_bh,
134 u64 start_blk,
135 unsigned int num_clusters);
130 136
131static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 137static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
132{ 138{
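The header now exposes ocfs2_release_clusters() next to ocfs2_free_clusters(); per the suballoc.c hunks, both route to _ocfs2_free_clusters() and differ only in the bit callback threaded down to _ocfs2_free_suballoc_bits() — mirror freed bits into the journal's undo copy versus simply clearing them, since never-used clusters need no undo protection. A hypothetical caller-side sketch; link_clusters_into_tree(), bitmap_inode, and bitmap_bh are illustrative placeholders, not names from this patch:

	u32 cpos, got;
	int status;

	status = ocfs2_claim_clusters(handle, data_ac, 1, &cpos, &got);
	if (status < 0)
		goto out;

	status = link_clusters_into_tree(inode, handle, cpos, got);
	if (status < 0)
		/* Claimed in this transaction but never used: hand the
		 * bits back without journal undo protection. */
		ocfs2_release_clusters(handle, bitmap_inode, bitmap_bh,
				       ocfs2_clusters_to_blocks(inode->i_sb,
								cpos),
				       got);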
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -873,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
873 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
874 continue; 880 continue;
875 if (unsuspend) 881 if (unsuspend)
876 status = vfs_quota_enable( 882 status = dquot_resume(sb, type);
877 sb_dqopt(sb)->files[type], 883 else {
878 type, QFMT_OCFS2, 884 struct ocfs2_mem_dqinfo *oinfo;
879 DQUOT_SUSPENDED); 885
880 else 886 /* Cancel periodic syncing before suspending */
881 status = vfs_quota_disable(sb, type, 887 oinfo = sb_dqinfo(sb, type)->dqi_priv;
882 DQUOT_SUSPENDED); 888 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
889 status = dquot_suspend(sb, type);
890 }
883 if (status < 0) 891 if (status < 0)
884 break; 892 break;
885 } 893 }
@@ -910,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
910 status = -ENOENT; 918 status = -ENOENT;
911 goto out_quota_off; 919 goto out_quota_off;
912 } 920 }
913 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 921 status = dquot_enable(inode[type], type, QFMT_OCFS2,
914 DQUOT_USAGE_ENABLED); 922 DQUOT_USAGE_ENABLED);
915 if (status < 0) 923 if (status < 0)
916 goto out_quota_off; 924 goto out_quota_off;
917 } 925 }
@@ -932,18 +940,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 940 int type;
933 struct inode *inode; 941 struct inode *inode;
934 struct super_block *sb = osb->sb; 942 struct super_block *sb = osb->sb;
943 struct ocfs2_mem_dqinfo *oinfo;
935 944
936 /* We mostly ignore errors in this function because there's not much 945 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 946 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 947 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 948 if (!sb_has_quota_loaded(sb, type))
940 continue; 949 continue;
950 /* Cancel periodic syncing before we grab dqonoff_mutex */
951 oinfo = sb_dqinfo(sb, type)->dqi_priv;
952 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 953 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 954 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 955 * memory and so they will be automatically synced to global
944 * quota files */ 956 * quota files */
945 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 957 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
946 DQUOT_LIMITS_ENABLED); 958 DQUOT_LIMITS_ENABLED);
947 if (!inode) 959 if (!inode)
948 continue; 960 continue;
949 iput(inode); 961 iput(inode);
@@ -952,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 964
953/* Handle quota on quotactl */ 965/* Handle quota on quotactl */
954static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 966static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
955 char *path, int remount) 967 char *path)
956{ 968{
957 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 969 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
958 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 970 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -960,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
960 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 972 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
961 return -EINVAL; 973 return -EINVAL;
962 974
963 if (remount) 975 return dquot_enable(sb_dqopt(sb)->files[type], type,
964 return 0; /* Just ignore it has been handled in 976 format_id, DQUOT_LIMITS_ENABLED);
965 * ocfs2_remount() */
966 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
967 format_id, DQUOT_LIMITS_ENABLED);
968} 977}
969 978
970/* Handle quota off quotactl */ 979/* Handle quota off quotactl */
971static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 980static int ocfs2_quota_off(struct super_block *sb, int type)
972{ 981{
973 if (remount) 982 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
974 return 0; /* Ignore now and handle later in
975 * ocfs2_remount() */
976 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
977} 983}
978 984
979static const struct quotactl_ops ocfs2_quotactl_ops = { 985static const struct quotactl_ops ocfs2_quotactl_ops = {
980 .quota_on = ocfs2_quota_on, 986 .quota_on = ocfs2_quota_on,
981 .quota_off = ocfs2_quota_off, 987 .quota_off = ocfs2_quota_off,
982 .quota_sync = vfs_quota_sync, 988 .quota_sync = dquot_quota_sync,
983 .get_info = vfs_get_dqinfo, 989 .get_info = dquot_get_dqinfo,
984 .set_info = vfs_set_dqinfo, 990 .set_info = dquot_set_dqinfo,
985 .get_dqblk = vfs_get_dqblk, 991 .get_dqblk = dquot_get_dqblk,
986 .set_dqblk = vfs_set_dqblk, 992 .set_dqblk = dquot_set_dqblk,
987}; 993};
988 994
989static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 995static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1034 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1035 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1036 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1037
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1038 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1039 osb->osb_resv_level = parsed_options.resv_level;
1040 osb->osb_dir_resv_level = parsed_options.resv_level;
1041 if (parsed_options.dir_resv_level == -1)
1042 osb->osb_dir_resv_level = parsed_options.resv_level;
1043 else
1044 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1045
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1046 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1047 if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1297 options ? options : "(none)");
1286 1298
1287 mopt->commit_interval = 0; 1299 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1300 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1301 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1302 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1303 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1304 mopt->cluster_stack[0] = '\0';
1305 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1306 mopt->dir_resv_level = -1;
1293 1307
1294 if (!options) { 1308 if (!options) {
1295 status = 1; 1309 status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1394 status = 0;
1381 goto bail; 1395 goto bail;
1382 } 1396 }
1383 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) 1397 if (option >= 0)
1384 mopt->localalloc_opt = option; 1398 mopt->localalloc_opt = option;
1385 break; 1399 break;
1386 case Opt_localflocks: 1400 case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1447 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1448 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1449 break;
1450 case Opt_resv_level:
1451 if (is_remount)
1452 break;
1453 if (match_int(&args[0], &option)) {
1454 status = 0;
1455 goto bail;
1456 }
1457 if (option >= OCFS2_MIN_RESV_LEVEL &&
1458 option < OCFS2_MAX_RESV_LEVEL)
1459 mopt->resv_level = option;
1460 break;
1461 case Opt_dir_resv_level:
1462 if (is_remount)
1463 break;
1464 if (match_int(&args[0], &option)) {
1465 status = 0;
1466 goto bail;
1467 }
1468 if (option >= OCFS2_MIN_RESV_LEVEL &&
1469 option < OCFS2_MAX_RESV_LEVEL)
1470 mopt->dir_resv_level = option;
1471 break;
1436 default: 1472 default:
1437 mlog(ML_ERROR, 1473 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1474 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1523 (unsigned) (osb->osb_commit_interval / HZ));
1488 1524
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1525 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) 1526 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1527 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1528
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1529 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1550 else
1515 seq_printf(s, ",noacl"); 1551 seq_printf(s, ",noacl");
1516 1552
1553 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1554 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1555
1556 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1557 seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1558
1517 return 0; 1559 return 0;
1518} 1560}
1519 1561
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1730 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1731 oi->ip_clusters = 0;
1690 1732
1733 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1734
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1735 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1736 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1737 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2086
2043 init_waitqueue_head(&osb->osb_mount_event); 2087 init_waitqueue_head(&osb->osb_mount_event);
2044 2088
2089 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2090 if (status) {
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2095 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2096 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2097 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2274 }
2225 2275
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2276 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2277 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2278 iput(inode);
2228 2279
2229 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; 2280 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2281 osb->s_feature_incompat) * 8;
2230 2282
2231 status = ocfs2_init_slot_info(osb); 2283 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2284 if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2561 ocfs2_handle_error(sb);
2510} 2562}
2511 2563
2564/*
2565 * Void signal blockers, because in-kernel sigprocmask() only fails
2566 * when SIG_* is wrong.
2567 */
2568void ocfs2_block_signals(sigset_t *oldset)
2569{
2570 int rc;
2571 sigset_t blocked;
2572
2573 sigfillset(&blocked);
2574 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2575 BUG_ON(rc);
2576}
2577
2578void ocfs2_unblock_signals(sigset_t *oldset)
2579{
2580 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2581 BUG_ON(rc);
2582}
2583
2512module_init(ocfs2_init); 2584module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2585module_exit(ocfs2_exit);
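One ordering constraint in the ocfs2_susp_quotas()/ocfs2_disable_quotas() hunks above is easy to miss: the per-type delayed sync work must be cancelled before handing off to the generic dquot code, otherwise the periodic worker could fire against quota state that is mid-suspend or mid-teardown. Condensed from the hunks (error handling elided; everything shown appears in the diff):

	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;

	/* Stop the periodic syncer first... */
	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
	/* ...then let the generic dquot code suspend the type. */
	status = dquot_suspend(sb, type);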
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
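A usage sketch for the pair declared above: oldset carries the caller's original mask across the critical section. The actual call sites this change adds live elsewhere in ocfs2; the body between the two calls here is illustrative only:

	sigset_t oldset;

	ocfs2_block_signals(&oldset);	/* mask all signals; cannot fail */
	/* ...work that must not be interrupted or restarted by a signal,
	 * e.g. taking and dropping a cluster lock... */
	ocfs2_unblock_signals(&oldset);	/* restore the caller's mask */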
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
96 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
97}; 98};
98 99
99struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
101 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
102 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
105 NULL 106 NULL
106}; 107};
107 108
108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
111 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
539 540
540static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
541{ 542{
542 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
543 544
544 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
545 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 status = ocfs2_journal_dirty(handle, vb->vb_bh); 743 ocfs2_journal_dirty(handle, vb->vb_bh);
743 if (status < 0) {
744 mlog_errno(status);
745 goto leave;
746 }
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 786 ocfs2_journal_dirty(handle, vb->vb_bh);
790 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 if (ret) {
792 mlog_errno(ret);
793 goto out;
794 }
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 ret = ocfs2_journal_dirty(handle, bh); 1369 ocfs2_journal_dirty(handle, bh);
1378 if (ret < 0) {
1379 mlog_errno(ret);
1380 goto out;
1381 }
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -1622,7 +1610,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1622 /* Now tell xh->xh_entries about it */ 1610 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) { 1611 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); 1612 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset < namevalue_offset) 1613 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, 1614 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size); 1615 namevalue_size);
1628 } 1616 }
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 /* 2139 ctxt->set_abort = 1;
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2474 if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 ret = ocfs2_journal_dirty(handle, di_bh); 2592 ocfs2_journal_dirty(handle, di_bh);
2598 if (ret < 0)
2599 mlog_errno(ret);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh); 2720 ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 u64 first_blkno; 2840 u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, 2852 ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2863 &suballoc_bit_start, &num_got, 2853 &suballoc_loc, &suballoc_bit_start,
2864 &first_blkno); 2854 &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2878 xblk->xb_fs_generation =
2879 cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
3350 goto out; 3340 goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3378 goto out; 3368 goto out;
3379 } 3369 }
3380 3370
3381 ret = ocfs2_extend_trans(ctxt->handle, credits + 3371 ret = ocfs2_extend_trans(ctxt->handle, credits);
3382 ctxt->handle->h_buffer_credits);
3383 if (ret) { 3372 if (ret) {
3384 mlog_errno(ret); 3373 mlog_errno(ret);
3385 goto out; 3374 goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4249 u32 bit_off, len; 4238 u32 bit_off, len;
4250 u64 blkno; 4239 u64 blkno;
4251 handle_t *handle = ctxt->handle; 4240 handle_t *handle = ctxt->handle;
4252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4253 struct ocfs2_inode_info *oi = OCFS2_I(inode); 4241 struct ocfs2_inode_info *oi = OCFS2_I(inode);
4254 struct buffer_head *xb_bh = xs->xattr_bh; 4242 struct buffer_head *xb_bh = xs->xattr_bh;
4255 struct ocfs2_xattr_block *xb = 4243 struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4277 goto out; 4265 goto out;
4278 } 4266 }
4279 4267
4280 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 4268 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
4281 1, 1, &bit_off, &len); 4269 1, 1, &bit_off, &len);
4282 if (ret) { 4270 if (ret) {
4283 mlog_errno(ret); 4271 mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4887 * We need to update the first bucket of the old extent and all 4875 * We need to update the first bucket of the old extent and all
4888 * the buckets going to the new extent. 4876 * the buckets going to the new extent.
4889 */ 4877 */
4890 credits = ((num_buckets + 1) * blks_per_bucket) + 4878 credits = ((num_buckets + 1) * blks_per_bucket);
4891 handle->h_buffer_credits;
4892 ret = ocfs2_extend_trans(handle, credits); 4879 ret = ocfs2_extend_trans(handle, credits);
4893 if (ret) { 4880 if (ret) {
4894 mlog_errno(ret); 4881 mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
4958 u32 *first_hash) 4945 u32 *first_hash)
4959{ 4946{
4960 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4947 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4961 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; 4948 int ret, credits = 2 * blk_per_bucket;
4962 4949
4963 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4950 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
4964 4951
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5099 goto leave; 5086 goto leave;
5100 } 5087 }
5101 5088
5102 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, 5089 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
5103 clusters_to_add, &bit_off, &num_bits); 5090 clusters_to_add, &bit_off, &num_bits);
5104 if (ret < 0) { 5091 if (ret < 0) {
5105 if (ret != -ENOSPC) 5092 if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5153 goto leave; 5140 goto leave;
5154 } 5141 }
5155 5142
5156 ret = ocfs2_journal_dirty(handle, root_bh); 5143 ocfs2_journal_dirty(handle, root_bh);
5157 if (ret < 0)
5158 mlog_errno(ret);
5159 5144
5160leave: 5145leave:
5161 return ret; 5146 return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5200 * existing bucket. Then we add the last existing bucket, the 5185 * existing bucket. Then we add the last existing bucket, the
5201 * new bucket, and the first bucket (3 * blk_per_bucket). 5186 * new bucket, and the first bucket (3 * blk_per_bucket).
5202 */ 5187 */
5203 credits = (end_blk - target_blk) + (3 * blk_per_bucket) + 5188 credits = (end_blk - target_blk) + (3 * blk_per_bucket);
5204 handle->h_buffer_credits;
5205 ret = ocfs2_extend_trans(handle, credits); 5189 ret = ocfs2_extend_trans(handle, credits);
5206 if (ret) { 5190 if (ret) {
5207 mlog_errno(ret); 5191 mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5477 } 5461 }
5478 5462
5479 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); 5463 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
5480 5464 ocfs2_journal_dirty(handle, root_bh);
5481 ret = ocfs2_journal_dirty(handle, root_bh);
5482 if (ret) {
5483 mlog_errno(ret);
5484 goto out_commit;
5485 }
5486 5465
5487 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5466 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5488 if (ret) 5467 if (ret)
@@ -6528,13 +6507,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6528 int indexed) 6507 int indexed)
6529{ 6508{
6530 int ret; 6509 int ret;
6531 struct ocfs2_alloc_context *meta_ac;
6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = { 6511 struct ocfs2_xattr_set_ctxt ctxt;
6534 .meta_ac = meta_ac,
6535 };
6536 6512
6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6513 memset(&ctxt, 0, sizeof(ctxt));
6514 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6538 if (ret < 0) { 6515 if (ret < 0) {
6539 mlog_errno(ret); 6516 mlog_errno(ret);
6540 return ret; 6517 return ret;
@@ -6556,7 +6533,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6556 6533
6557 ocfs2_commit_trans(osb, ctxt.handle); 6534 ocfs2_commit_trans(osb, ctxt.handle);
6558out: 6535out:
6559 ocfs2_free_alloc_context(meta_ac); 6536 ocfs2_free_alloc_context(ctxt.meta_ac);
6560 return ret; 6537 return ret;
6561} 6538}
6562 6539
@@ -6937,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6937 goto out; 6914 goto out;
6938 } 6915 }
6939 6916
6940 ret = ocfs2_claim_clusters(osb, handle, data_ac, 6917 ret = ocfs2_claim_clusters(handle, data_ac,
6941 len, &p_cluster, &num_clusters); 6918 len, &p_cluster, &num_clusters);
6942 if (ret) { 6919 if (ret) {
6943 mlog_errno(ret); 6920 mlog_errno(ret);
@@ -7236,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
7236 xattr_ac, data_ac); 7213 xattr_ac, data_ac);
7237} 7214}
7238 7215
7239struct xattr_handler ocfs2_xattr_security_handler = { 7216const struct xattr_handler ocfs2_xattr_security_handler = {
7240 .prefix = XATTR_SECURITY_PREFIX, 7217 .prefix = XATTR_SECURITY_PREFIX,
7241 .list = ocfs2_xattr_security_list, 7218 .list = ocfs2_xattr_security_list,
7242 .get = ocfs2_xattr_security_get, 7219 .get = ocfs2_xattr_security_get,
@@ -7280,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7280 name, value, size, flags); 7257 name, value, size, flags);
7281} 7258}
7282 7259
7283struct xattr_handler ocfs2_xattr_trusted_handler = { 7260const struct xattr_handler ocfs2_xattr_trusted_handler = {
7284 .prefix = XATTR_TRUSTED_PREFIX, 7261 .prefix = XATTR_TRUSTED_PREFIX,
7285 .list = ocfs2_xattr_trusted_list, 7262 .list = ocfs2_xattr_trusted_list,
7286 .get = ocfs2_xattr_trusted_get, 7263 .get = ocfs2_xattr_trusted_get,
@@ -7336,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7336 name, value, size, flags); 7313 name, value, size, flags);
7337} 7314}
7338 7315
7339struct xattr_handler ocfs2_xattr_user_handler = { 7316const struct xattr_handler ocfs2_xattr_user_handler = {
7340 .prefix = XATTR_USER_PREFIX, 7317 .prefix = XATTR_USER_PREFIX,
7341 .list = ocfs2_xattr_user_list, 7318 .list = ocfs2_xattr_user_list,
7342 .get = ocfs2_xattr_user_get, 7319 .get = ocfs2_xattr_user_get,
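
The ocfs2/xattr.c hunks above all converge on two API changes: ocfs2_journal_dirty() no longer returns a status, and ocfs2_extend_trans() is now passed only the additional credits wanted, so callers stop adding handle->h_buffer_credits themselves. A minimal sketch of the resulting call pattern, assuming those post-series semantics; example_update_block() and its arguments are hypothetical, not from this patch:

static int example_update_block(handle_t *handle, struct buffer_head *bh,
				int credits)
{
	int ret;

	/* Request 'credits' extra buffer credits on the running handle;
	 * the old "credits + handle->h_buffer_credits" idiom is gone. */
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* ... modify bh->b_data under the handle here ... */

	/* Void in this series: no status to check or propagate. */
	ocfs2_journal_dirty(handle, bh);
	return 0;
}
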
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
37 size_t value_len; 37 size_t value_len;
38}; 38};
39 39
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 46
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, 48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
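
The header hunk completes the constification: both the individual handlers and the NULL-terminated table become const. A hypothetical filesystem adopting the same convention would declare its table as below; all example_* names are illustrative, not from this patch:

static const struct xattr_handler example_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= example_xattr_list,	/* hypothetical callbacks */
	.get	= example_xattr_get,
	.set	= example_xattr_set,
};

/* NULL-terminated table, wired into the superblock via sb->s_xattr. */
const struct xattr_handler *example_xattr_handlers[] = {
	&example_user_handler,
	NULL,
};
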
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
329 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
330 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
331 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
332 .fsync = simple_fsync, 332 .fsync = generic_file_fsync,
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
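
The omfs hunk swaps simple_fsync() for generic_file_fsync(), part of this series' consolidation of the default fsync path; for a plain block-device filesystem the behavior should be equivalent. A sketch of the typical wiring, using an illustrative operations table rather than omfs's actual one:

const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,	/* was simple_fsync */
};
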
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,9 +3,9 @@
3 * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> 3 * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
4 * Released under GPL v2. 4 * Released under GPL v2.
5 */ 5 */
6#include <linux/version.h>
7#include <linux/module.h> 6#include <linux/module.h>
8#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/slab.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
37 goto fail; 37 goto fail;
38 38
39 inode->i_ino = new_block; 39 inode->i_ino = new_block;
40 inode->i_mode = mode; 40 inode_init_owner(inode, NULL, mode);
41 inode->i_uid = current_fsuid();
42 inode->i_gid = current_fsgid();
43 inode->i_mapping->a_ops = &omfs_aops; 41 inode->i_mapping->a_ops = &omfs_aops;
44 42
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
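
omfs_new_inode() now calls inode_init_owner(inode, NULL, mode); with a NULL directory the helper reduces to exactly the three assignments it replaces. For reference, a simplified sketch of what the helper is assumed to do, including the setgid-directory case omfs does not use:

static void sketch_inode_init_owner(struct inode *inode,
				    const struct inode *dir, int mode)
{
	inode->i_uid = current_fsuid();
	if (dir && (dir->i_mode & S_ISGID)) {
		/* setgid directory: the new inode inherits the group,
		 * and subdirectories inherit the setgid bit as well */
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
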
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/tty.h> 13#include <linux/tty.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -18,8 +17,8 @@
18#include <linux/securebits.h> 17#include <linux/securebits.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/mount.h> 19#include <linux/mount.h>
21#include <linux/vfs.h>
22#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/slab.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/fs.h> 23#include <linux/fs.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
@@ -33,171 +32,6 @@
33 32
34#include "internal.h" 33#include "internal.h"
35 34
36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
37{
38 int retval = -ENODEV;
39
40 if (dentry) {
41 retval = -ENOSYS;
42 if (dentry->d_sb->s_op->statfs) {
43 memset(buf, 0, sizeof(*buf));
44 retval = security_sb_statfs(dentry);
45 if (retval)
46 return retval;
47 retval = dentry->d_sb->s_op->statfs(dentry, buf);
48 if (retval == 0 && buf->f_frsize == 0)
49 buf->f_frsize = buf->f_bsize;
50 }
51 }
52 return retval;
53}
54
55EXPORT_SYMBOL(vfs_statfs);
56
57static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
58{
59 struct kstatfs st;
60 int retval;
61
62 retval = vfs_statfs(dentry, &st);
63 if (retval)
64 return retval;
65
66 if (sizeof(*buf) == sizeof(st))
67 memcpy(buf, &st, sizeof(st));
68 else {
69 if (sizeof buf->f_blocks == 4) {
70 if ((st.f_blocks | st.f_bfree | st.f_bavail |
71 st.f_bsize | st.f_frsize) &
72 0xffffffff00000000ULL)
73 return -EOVERFLOW;
74 /*
75 * f_files and f_ffree may be -1; it's okay to stuff
76 * that into 32 bits
77 */
78 if (st.f_files != -1 &&
79 (st.f_files & 0xffffffff00000000ULL))
80 return -EOVERFLOW;
81 if (st.f_ffree != -1 &&
82 (st.f_ffree & 0xffffffff00000000ULL))
83 return -EOVERFLOW;
84 }
85
86 buf->f_type = st.f_type;
87 buf->f_bsize = st.f_bsize;
88 buf->f_blocks = st.f_blocks;
89 buf->f_bfree = st.f_bfree;
90 buf->f_bavail = st.f_bavail;
91 buf->f_files = st.f_files;
92 buf->f_ffree = st.f_ffree;
93 buf->f_fsid = st.f_fsid;
94 buf->f_namelen = st.f_namelen;
95 buf->f_frsize = st.f_frsize;
96 memset(buf->f_spare, 0, sizeof(buf->f_spare));
97 }
98 return 0;
99}
100
101static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
102{
103 struct kstatfs st;
104 int retval;
105
106 retval = vfs_statfs(dentry, &st);
107 if (retval)
108 return retval;
109
110 if (sizeof(*buf) == sizeof(st))
111 memcpy(buf, &st, sizeof(st));
112 else {
113 buf->f_type = st.f_type;
114 buf->f_bsize = st.f_bsize;
115 buf->f_blocks = st.f_blocks;
116 buf->f_bfree = st.f_bfree;
117 buf->f_bavail = st.f_bavail;
118 buf->f_files = st.f_files;
119 buf->f_ffree = st.f_ffree;
120 buf->f_fsid = st.f_fsid;
121 buf->f_namelen = st.f_namelen;
122 buf->f_frsize = st.f_frsize;
123 memset(buf->f_spare, 0, sizeof(buf->f_spare));
124 }
125 return 0;
126}
127
128SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
129{
130 struct path path;
131 int error;
132
133 error = user_path(pathname, &path);
134 if (!error) {
135 struct statfs tmp;
136 error = vfs_statfs_native(path.dentry, &tmp);
137 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
138 error = -EFAULT;
139 path_put(&path);
140 }
141 return error;
142}
143
144SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
145{
146 struct path path;
147 long error;
148
149 if (sz != sizeof(*buf))
150 return -EINVAL;
151 error = user_path(pathname, &path);
152 if (!error) {
153 struct statfs64 tmp;
154 error = vfs_statfs64(path.dentry, &tmp);
155 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
156 error = -EFAULT;
157 path_put(&path);
158 }
159 return error;
160}
161
162SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
163{
164 struct file * file;
165 struct statfs tmp;
166 int error;
167
168 error = -EBADF;
169 file = fget(fd);
170 if (!file)
171 goto out;
172 error = vfs_statfs_native(file->f_path.dentry, &tmp);
173 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
174 error = -EFAULT;
175 fput(file);
176out:
177 return error;
178}
179
180SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
181{
182 struct file * file;
183 struct statfs64 tmp;
184 int error;
185
186 if (sz != sizeof(*buf))
187 return -EINVAL;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = vfs_statfs64(file->f_path.dentry, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 35int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
202 struct file *filp) 36 struct file *filp)
203{ 37{
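
The entire statfs family leaves fs/open.c; the code is relocated rather than dropped (in mainline it lands as fs/statfs.c). The one non-obvious rule it carries along is vfs_statfs_native()'s overflow check: when struct statfs has 32-bit fields, any high bit in the 64-bit counts means -EOVERFLOW, except that f_files and f_ffree may legitimately be -1 ("unknown"). A self-contained restatement of that rule, with a hypothetical helper name and userspace types:

#include <errno.h>
#include <stdint.h>

static int fits_legacy_statfs(uint64_t blocks, uint64_t bfree,
			      uint64_t bavail, uint64_t bsize,
			      uint64_t frsize, uint64_t files,
			      uint64_t ffree)
{
	/* Any of the size fields overflowing 32 bits is fatal. */
	if ((blocks | bfree | bavail | bsize | frsize) &
	    0xffffffff00000000ULL)
		return -EOVERFLOW;
	/* f_files/f_ffree may be -1 ("unknown"); reject only other
	 * values that do not fit in 32 bits. */
	if (files != (uint64_t)-1 && (files & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	if (ffree != (uint64_t)-1 && (ffree & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	return 0;
}
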
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
70 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS) 72 defined(CONFIG_ACORN_PARTITION_ADFS)
73static int 73static int riscix_partition(struct parsed_partitions *state,
74riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74 unsigned long first_sect, int slot,
75 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long nr_sects)
76{ 76{
77 Sector sect; 77 Sector sect;
78 struct riscix_record *rr; 78 struct riscix_record *rr;
79 79
80 rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect); 80 rr = read_part_sector(state, first_sect, &sect);
81 if (!rr) 81 if (!rr)
82 return -1; 82 return -1;
83 83
@@ -123,9 +123,9 @@ struct linux_part {
123 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS) 125 defined(CONFIG_ACORN_PARTITION_ADFS)
126static int 126static int linux_partition(struct parsed_partitions *state,
127linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127 unsigned long first_sect, int slot,
128 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long nr_sects)
129{ 129{
130 Sector sect; 130 Sector sect;
131 struct linux_part *linuxp; 131 struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
135 135
136 put_partition(state, slot++, first_sect, size); 136 put_partition(state, slot++, first_sect, size);
137 137
138 linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect); 138 linuxp = read_part_sector(state, first_sect, &sect);
139 if (!linuxp) 139 if (!linuxp)
140 return -1; 140 return -1;
141 141
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
157#endif 157#endif
158 158
159#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
160int 160int adfspart_check_CUMANA(struct parsed_partitions *state)
161adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
162{ 161{
163 unsigned long first_sector = 0; 162 unsigned long first_sector = 0;
164 unsigned int start_blk = 0; 163 unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
185 struct adfs_discrecord *dr; 184 struct adfs_discrecord *dr;
186 unsigned int nr_sects; 185 unsigned int nr_sects;
187 186
188 data = read_dev_sector(bdev, start_blk * 2 + 6, &sect); 187 data = read_part_sector(state, start_blk * 2 + 6, &sect);
189 if (!data) 188 if (!data)
190 return -1; 189 return -1;
191 190
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
217#ifdef CONFIG_ACORN_PARTITION_RISCIX 216#ifdef CONFIG_ACORN_PARTITION_RISCIX
218 case PARTITION_RISCIX_SCSI: 217 case PARTITION_RISCIX_SCSI:
219 /* RISCiX - we don't know how to find the next one. */ 218 /* RISCiX - we don't know how to find the next one. */
220 slot = riscix_partition(state, bdev, first_sector, 219 slot = riscix_partition(state, first_sector, slot,
221 slot, nr_sects); 220 nr_sects);
222 break; 221 break;
223#endif 222#endif
224 223
225 case PARTITION_LINUX: 224 case PARTITION_LINUX:
226 slot = linux_partition(state, bdev, first_sector, 225 slot = linux_partition(state, first_sector, slot,
227 slot, nr_sects); 226 nr_sects);
228 break; 227 break;
229 } 228 }
230 put_dev_sector(sect); 229 put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
249 * hda1 = ADFS partition on first drive. 248 * hda1 = ADFS partition on first drive.
250 * hda2 = non-ADFS partition. 249 * hda2 = non-ADFS partition.
251 */ 250 */
252int 251int adfspart_check_ADFS(struct parsed_partitions *state)
253adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
254{ 252{
255 unsigned long start_sect, nr_sects, sectscyl, heads; 253 unsigned long start_sect, nr_sects, sectscyl, heads;
256 Sector sect; 254 Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
259 unsigned char id; 257 unsigned char id;
260 int slot = 1; 258 int slot = 1;
261 259
262 data = read_dev_sector(bdev, 6, &sect); 260 data = read_part_sector(state, 6, &sect);
263 if (!data) 261 if (!data)
264 return -1; 262 return -1;
265 263
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
278 /* 276 /*
279 * Work out start of non-adfs partition. 277 * Work out start of non-adfs partition.
280 */ 278 */
281 nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; 279 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
282 280
283 if (start_sect) { 281 if (start_sect) {
284 switch (id) { 282 switch (id) {
285#ifdef CONFIG_ACORN_PARTITION_RISCIX 283#ifdef CONFIG_ACORN_PARTITION_RISCIX
286 case PARTITION_RISCIX_SCSI: 284 case PARTITION_RISCIX_SCSI:
287 case PARTITION_RISCIX_MFM: 285 case PARTITION_RISCIX_MFM:
288 slot = riscix_partition(state, bdev, start_sect, 286 slot = riscix_partition(state, start_sect, slot,
289 slot, nr_sects); 287 nr_sects);
290 break; 288 break;
291#endif 289#endif
292 290
293 case PARTITION_LINUX: 291 case PARTITION_LINUX:
294 slot = linux_partition(state, bdev, start_sect, 292 slot = linux_partition(state, start_sect, slot,
295 slot, nr_sects); 293 nr_sects);
296 break; 294 break;
297 } 295 }
298 } 296 }
@@ -308,10 +306,11 @@ struct ics_part {
308 __le32 size; 306 __le32 size;
309}; 307};
310 308
311static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) 309static int adfspart_check_ICSLinux(struct parsed_partitions *state,
310 unsigned long block)
312{ 311{
313 Sector sect; 312 Sector sect;
314 unsigned char *data = read_dev_sector(bdev, block, &sect); 313 unsigned char *data = read_part_sector(state, block, &sect);
315 int result = 0; 314 int result = 0;
316 315
317 if (data) { 316 if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
349 * hda2 = ADFS partition 1 on first drive. 348 * hda2 = ADFS partition 1 on first drive.
350 * ..etc.. 349 * ..etc..
351 */ 350 */
352int 351int adfspart_check_ICS(struct parsed_partitions *state)
353adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
354{ 352{
355 const unsigned char *data; 353 const unsigned char *data;
356 const struct ics_part *p; 354 const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
360 /* 358 /*
361 * Try ICS style partitions - sector 0 contains partition info. 359 * Try ICS style partitions - sector 0 contains partition info.
362 */ 360 */
363 data = read_dev_sector(bdev, 0, &sect); 361 data = read_part_sector(state, 0, &sect);
364 if (!data) 362 if (!data)
365 return -1; 363 return -1;
366 364
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
392 * partition is. We must not make this visible 390 * partition is. We must not make this visible
393 * to the filesystem. 391 * to the filesystem.
394 */ 392 */
395 if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { 393 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
396 start += 1; 394 start += 1;
397 size -= 1; 395 size -= 1;
398 } 396 }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
446 * hda2 = ADFS partition 1 on first drive. 444 * hda2 = ADFS partition 1 on first drive.
447 * ..etc.. 445 * ..etc..
448 */ 446 */
449int 447int adfspart_check_POWERTEC(struct parsed_partitions *state)
450adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
451{ 448{
452 Sector sect; 449 Sector sect;
453 const unsigned char *data; 450 const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
455 int slot = 1; 452 int slot = 1;
456 int i; 453 int i;
457 454
458 data = read_dev_sector(bdev, 0, &sect); 455 data = read_part_sector(state, 0, &sect);
459 if (!data) 456 if (!data)
460 return -1; 457 return -1;
461 458
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
508 * 1. The individual ADFS boot block entries that are placed on the disk. 505 * 1. The individual ADFS boot block entries that are placed on the disk.
509 * 2. The start address of the next entry. 506 * 2. The start address of the next entry.
510 */ 507 */
511int 508int adfspart_check_EESOX(struct parsed_partitions *state)
512adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
513{ 509{
514 Sector sect; 510 Sector sect;
515 const unsigned char *data; 511 const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
518 sector_t start = 0; 514 sector_t start = 0;
519 int i, slot = 1; 515 int i, slot = 1;
520 516
521 data = read_dev_sector(bdev, 7, &sect); 517 data = read_part_sector(state, 7, &sect);
522 if (!data) 518 if (!data)
523 return -1; 519 return -1;
524 520
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
545 if (i != 0) { 541 if (i != 0) {
546 sector_t size; 542 sector_t size;
547 543
548 size = get_capacity(bdev->bd_disk); 544 size = get_capacity(state->bdev->bd_disk);
549 put_partition(state, slot++, start, size - start); 545 put_partition(state, slot++, start, size - start);
550 printk("\n"); 546 printk("\n");
551 } 547 }
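
From here on, every label parser in fs/partitions gets the same mechanical conversion: drop the block_device parameter, reach the device through state->bdev, and replace read_dev_sector() with the bounds-checking read_part_sector() introduced in the check.h hunk further below. A skeleton of a converted parser, illustrative only:

static int example_partition(struct parsed_partitions *state)
{
	Sector sect;
	unsigned char *data;

	/* Returns NULL and sets state->access_beyond_eod when the
	 * sector lies past the currently visible end of the device. */
	data = read_part_sector(state, 0, &sect);
	if (!data)
		return -1;	/* I/O error, recorded by check.c */

	/* ... validate the label, then put_partition(state, slot, ...) ... */

	put_dev_sector(sect);
	return 1;		/* table recognized */
}
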
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
7 * format, and everyone stick to it? 7 * format, and everyone stick to it?
8 */ 8 */
9 9
10int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); 10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); 11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); 12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); 13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); 14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
23 return sum; 23 return sum;
24} 24}
25 25
26int 26int amiga_partition(struct parsed_partitions *state)
27amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
28{ 27{
29 Sector sect; 28 Sector sect;
30 unsigned char *data; 29 unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
38 for (blk = 0; ; blk++, put_dev_sector(sect)) { 37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
39 if (blk == RDB_ALLOCATION_LIMIT) 38 if (blk == RDB_ALLOCATION_LIMIT)
40 goto rdb_done; 39 goto rdb_done;
41 data = read_dev_sector(bdev, blk, &sect); 40 data = read_part_sector(state, blk, &sect);
42 if (!data) { 41 if (!data) {
43 if (warn_no_part) 42 if (warn_no_part)
44 printk("Dev %s: unable to read RDB block %d\n", 43 printk("Dev %s: unable to read RDB block %d\n",
45 bdevname(bdev, b), blk); 44 bdevname(state->bdev, b), blk);
46 res = -1; 45 res = -1;
47 goto rdb_done; 46 goto rdb_done;
48 } 47 }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
64 } 63 }
65 64
66 printk("Dev %s: RDB in block %d has bad checksum\n", 65 printk("Dev %s: RDB in block %d has bad checksum\n",
67 bdevname(bdev, b), blk); 66 bdevname(state->bdev, b), blk);
68 } 67 }
69 68
70 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
75 put_dev_sector(sect); 74 put_dev_sector(sect);
76 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
77 blk *= blksize; /* Read in terms partition table understands */ 76 blk *= blksize; /* Read in terms partition table understands */
78 data = read_dev_sector(bdev, blk, &sect); 77 data = read_part_sector(state, blk, &sect);
79 if (!data) { 78 if (!data) {
80 if (warn_no_part) 79 if (warn_no_part)
81 printk("Dev %s: unable to read partition block %d\n", 80 printk("Dev %s: unable to read partition block %d\n",
82 bdevname(bdev, b), blk); 81 bdevname(state->bdev, b), blk);
83 res = -1; 82 res = -1;
84 goto rdb_done; 83 goto rdb_done;
85 } 84 }
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
2 * fs/partitions/amiga.h 2 * fs/partitions/amiga.h
3 */ 3 */
4 4
5int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); 5int amiga_partition(struct parsed_partitions *state);
6 6
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
30 memcmp (s, "RAW", 3) == 0 ; 30 memcmp (s, "RAW", 3) == 0 ;
31} 31}
32 32
33int atari_partition(struct parsed_partitions *state, struct block_device *bdev) 33int atari_partition(struct parsed_partitions *state)
34{ 34{
35 Sector sect; 35 Sector sect;
36 struct rootsector *rs; 36 struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ 42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif 43#endif
44 44
45 rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); 45 rs = read_part_sector(state, 0, &sect);
46 if (!rs) 46 if (!rs)
47 return -1; 47 return -1;
48 48
49 /* Verify this is an Atari rootsector: */ 49 /* Verify this is an Atari rootsector: */
50 hd_size = bdev->bd_inode->i_size >> 9; 50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) && 51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) && 52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) && 53 !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
84 printk(" XGM<"); 84 printk(" XGM<");
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) { 88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect); 89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect); 90 put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
31 u16 checksum; /* checksum for bootable disks */ 31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__)); 32} __attribute__((__packed__));
33 33
34int atari_partition(struct parsed_partitions *state, struct block_device *bdev); 34int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/slab.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/ctype.h> 21#include <linux/ctype.h>
21#include <linux/genhd.h> 22#include <linux/genhd.h>
@@ -44,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
44 45
45int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
46 47
47static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { 48static int (*check_part[])(struct parsed_partitions *) = {
48 /* 49 /*
49 * Probe partition formats with tables at disk address 0 50 * Probe partition formats with tables at disk address 0
50 * that also have an ADFS boot block at 0xdc0. 51 * that also have an ADFS boot block at 0xdc0.
@@ -160,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
160 struct parsed_partitions *state; 161 struct parsed_partitions *state;
161 int i, res, err; 162 int i, res, err;
162 163
163 state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
164 if (!state) 165 if (!state)
165 return NULL; 166 return NULL;
166 167
168 state->bdev = bdev;
167 disk_name(hd, 0, state->name); 169 disk_name(hd, 0, state->name);
168 printk(KERN_INFO " %s:", state->name); 170 printk(KERN_INFO " %s:", state->name);
169 if (isdigit(state->name[strlen(state->name)-1])) 171 if (isdigit(state->name[strlen(state->name)-1]))
@@ -173,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
173 i = res = err = 0; 175 i = res = err = 0;
174 while (!res && check_part[i]) { 176 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 177 memset(&state->parts, 0, sizeof(state->parts));
176 res = check_part[i++](state, bdev); 178 res = check_part[i++](state);
177 if (res < 0) { 179 if (res < 0) {
178 /* We have hit an I/O error which we don't report now. 180 /* We have hit an I/O error which we don't report now.
179 * But record it, and let the others do their job. 181 * But record it, and let the others do their job.
@@ -185,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
185 } 187 }
186 if (res > 0) 188 if (res > 0)
187 return state; 189 return state;
190 if (state->access_beyond_eod)
191 err = -ENOSPC;
188 if (err) 192 if (err)
189 /* The partition is unrecognized. So report I/O errors if there were any */ 193 /* The partition is unrecognized. So report I/O errors if there were any */
190 res = err; 194 res = err;
@@ -537,12 +541,33 @@ exit:
537 disk_part_iter_exit(&piter); 541 disk_part_iter_exit(&piter);
538} 542}
539 543
544static bool disk_unlock_native_capacity(struct gendisk *disk)
545{
546 const struct block_device_operations *bdops = disk->fops;
547
548 if (bdops->unlock_native_capacity &&
549 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
550 printk(KERN_CONT "enabling native capacity\n");
551 bdops->unlock_native_capacity(disk);
552 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
553 return true;
554 } else {
555 printk(KERN_CONT "truncated\n");
556 return false;
557 }
558}
559
540int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 560int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
541{ 561{
562 struct parsed_partitions *state = NULL;
542 struct disk_part_iter piter; 563 struct disk_part_iter piter;
543 struct hd_struct *part; 564 struct hd_struct *part;
544 struct parsed_partitions *state;
545 int p, highest, res; 565 int p, highest, res;
566rescan:
567 if (state && !IS_ERR(state)) {
568 kfree(state);
569 state = NULL;
570 }
546 571
547 if (bdev->bd_part_count) 572 if (bdev->bd_part_count)
548 return -EBUSY; 573 return -EBUSY;
@@ -561,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
561 bdev->bd_invalidated = 0; 586 bdev->bd_invalidated = 0;
562 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 587 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
563 return 0; 588 return 0;
564 if (IS_ERR(state)) /* I/O error reading the partition table */ 589 if (IS_ERR(state)) {
590 /*
591 * I/O error reading the partition table. If any
592 * partition code tried to read beyond EOD, retry
593 * after unlocking native capacity.
594 */
595 if (PTR_ERR(state) == -ENOSPC) {
596 printk(KERN_WARNING "%s: partition table beyond EOD, ",
597 disk->disk_name);
598 if (disk_unlock_native_capacity(disk))
599 goto rescan;
600 }
565 return -EIO; 601 return -EIO;
602 }
603 /*
604 * If any partition code tried to read beyond EOD, try
605 * unlocking native capacity even if partition table is
606 * successfully read as we could be missing some partitions.
607 */
608 if (state->access_beyond_eod) {
609 printk(KERN_WARNING
610 "%s: partition table partially beyond EOD, ",
611 disk->disk_name);
612 if (disk_unlock_native_capacity(disk))
613 goto rescan;
614 }
566 615
567 /* tell userspace that the media / partition table may have changed */ 616 /* tell userspace that the media / partition table may have changed */
568 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); 617 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -580,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
580 /* add partitions */ 629 /* add partitions */
581 for (p = 1; p < state->limit; p++) { 630 for (p = 1; p < state->limit; p++) {
582 sector_t size, from; 631 sector_t size, from;
583try_scan: 632
584 size = state->parts[p].size; 633 size = state->parts[p].size;
585 if (!size) 634 if (!size)
586 continue; 635 continue;
@@ -588,30 +637,21 @@ try_scan:
588 from = state->parts[p].from; 637 from = state->parts[p].from;
589 if (from >= get_capacity(disk)) { 638 if (from >= get_capacity(disk)) {
590 printk(KERN_WARNING 639 printk(KERN_WARNING
591 "%s: p%d ignored, start %llu is behind the end of the disk\n", 640 "%s: p%d start %llu is beyond EOD, ",
592 disk->disk_name, p, (unsigned long long) from); 641 disk->disk_name, p, (unsigned long long) from);
642 if (disk_unlock_native_capacity(disk))
643 goto rescan;
593 continue; 644 continue;
594 } 645 }
595 646
596 if (from + size > get_capacity(disk)) { 647 if (from + size > get_capacity(disk)) {
597 const struct block_device_operations *bdops = disk->fops;
598 unsigned long long capacity;
599
600 printk(KERN_WARNING 648 printk(KERN_WARNING
601 "%s: p%d size %llu exceeds device capacity, ", 649 "%s: p%d size %llu extends beyond EOD, ",
602 disk->disk_name, p, (unsigned long long) size); 650 disk->disk_name, p, (unsigned long long) size);
603 651
604 if (bdops->set_capacity && 652 if (disk_unlock_native_capacity(disk)) {
605 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { 653 /* free state and restart */
606 printk(KERN_CONT "enabling native capacity\n"); 654 goto rescan;
607 capacity = bdops->set_capacity(disk, ~0ULL);
608 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
609 if (capacity > get_capacity(disk)) {
610 set_capacity(disk, capacity);
611 check_disk_size_change(disk, bdev);
612 bdev->bd_invalidated = 0;
613 }
614 goto try_scan;
615 } else { 655 } else {
616 /* 656 /*
617 * we can not ignore partitions of broken tables 657 * we can not ignore partitions of broken tables
@@ -619,7 +659,6 @@ try_scan:
619 * we limit them to the end of the disk to avoid 659 * we limit them to the end of the disk to avoid
620 * creating invalid block devices 660 * creating invalid block devices
621 */ 661 */
622 printk(KERN_CONT "limited to end of disk\n");
623 size = get_capacity(disk) - from; 662 size = get_capacity(disk) - from;
624 } 663 }
625 } 664 }
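
The rescan_partitions() rework replaces the old per-partition try_scan retry with a whole-table rescan: whenever a parser read past end-of-device, or a partition starts or extends beyond it, the code tries disk_unlock_native_capacity() once and, if the drive exposed more sectors, reparses from scratch. Condensed control flow for orientation only; table_ok() stands in for the NULL/IS_ERR checks and is not a real function:

/* Not literal kernel code -- a condensed view of the "goto rescan"
 * loop added above. */
for (;;) {
	state = check_partition(disk, bdev);
	if (table_ok(state) && !state->access_beyond_eod)
		break;				/* use this table */
	if (!disk_unlock_native_capacity(disk))
		break;				/* capacity already unlocked */
	kfree(state);				/* drive grew: rescan */
}
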
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
6 * description. 6 * description.
7 */ 7 */
8struct parsed_partitions { 8struct parsed_partitions {
9 struct block_device *bdev;
9 char name[BDEVNAME_SIZE]; 10 char name[BDEVNAME_SIZE];
10 struct { 11 struct {
11 sector_t from; 12 sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
14 } parts[DISK_MAX_PARTS]; 15 } parts[DISK_MAX_PARTS];
15 int next; 16 int next;
16 int limit; 17 int limit;
18 bool access_beyond_eod;
17}; 19};
18 20
21static inline void *read_part_sector(struct parsed_partitions *state,
22 sector_t n, Sector *p)
23{
24 if (n >= get_capacity(state->bdev->bd_disk)) {
25 state->access_beyond_eod = true;
26 return NULL;
27 }
28 return read_dev_sector(state->bdev, n, p);
29}
30
19static inline void 31static inline void
20put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
21{ 33{
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h> 97#include <linux/math64.h>
98#include <linux/slab.h>
98#include "check.h" 99#include "check.h"
99#include "efi.h" 100#include "efi.h"
100 101
@@ -139,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
139 * the part[0] entry for this disk, and is the number of 140 * the part[0] entry for this disk, and is the number of
140 * physical sectors available on the disk. 141 * physical sectors available on the disk.
141 */ 142 */
142static u64 143static u64 last_lba(struct block_device *bdev)
143last_lba(struct block_device *bdev)
144{ 144{
145 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
146 return 0; 146 return 0;
@@ -180,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
180 180
181/** 181/**
182 * read_lba(): Read bytes from disk, starting at given LBA 182 * read_lba(): Read bytes from disk, starting at given LBA
183 * @bdev 183 * @state
184 * @lba 184 * @lba
185 * @buffer 185 * @buffer
186 * @size_t 186 * @size_t
187 * 187 *
188 * Description: Reads @count bytes from @bdev into @buffer. 188 * Description: Reads @count bytes from @state->bdev into @buffer.
189 * Returns number of bytes read on success, 0 on error. 189 * Returns number of bytes read on success, 0 on error.
190 */ 190 */
191static size_t 191static size_t read_lba(struct parsed_partitions *state,
192read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192 u64 lba, u8 *buffer, size_t count)
193{ 193{
194 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 struct block_device *bdev = state->bdev;
195 sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
196 197
197 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!buffer || lba > last_lba(bdev))
198 return 0; 199 return 0;
199 200
200 while (count) { 201 while (count) {
201 int copied = 512; 202 int copied = 512;
202 Sector sect; 203 Sector sect;
203 unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 unsigned char *data = read_part_sector(state, n++, &sect);
204 if (!data) 205 if (!data)
205 break; 206 break;
206 if (copied > count) 207 if (copied > count)
@@ -216,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
216 217
217/** 218/**
218 * alloc_read_gpt_entries(): reads partition entries from disk 219 * alloc_read_gpt_entries(): reads partition entries from disk
219 * @bdev 220 * @state
220 * @gpt - GPT header 221 * @gpt - GPT header
221 * 222 *
222 * Description: Returns ptes on success, NULL on error. 223 * Description: Returns ptes on success, NULL on error.
223 * Allocates space for PTEs based on information found in @gpt. 224 * Allocates space for PTEs based on information found in @gpt.
224 * Notes: remember to free pte when you're done! 225 * Notes: remember to free pte when you're done!
225 */ 226 */
226static gpt_entry * 227static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
227alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) 228 gpt_header *gpt)
228{ 229{
229 size_t count; 230 size_t count;
230 gpt_entry *pte; 231 gpt_entry *pte;
231 if (!bdev || !gpt) 232
233 if (!gpt)
232 return NULL; 234 return NULL;
233 235
234 count = le32_to_cpu(gpt->num_partition_entries) * 236 count = le32_to_cpu(gpt->num_partition_entries) *
@@ -239,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
239 if (!pte) 241 if (!pte)
240 return NULL; 242 return NULL;
241 243
242 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 244 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
243 (u8 *) pte, 245 (u8 *) pte,
244 count) < count) { 246 count) < count) {
245 kfree(pte); 247 kfree(pte);
@@ -251,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
251 253
252/** 254/**
253 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 255 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
254 * @bdev 256 * @state
255 * @lba is the Logical Block Address of the partition table 257 * @lba is the Logical Block Address of the partition table
256 * 258 *
257 * Description: returns GPT header on success, NULL on error. Allocates 259 * Description: returns GPT header on success, NULL on error. Allocates
258 * and fills a GPT header starting at @ from @bdev. 260 * and fills a GPT header starting at @ from @state->bdev.
259 * Note: remember to free gpt when finished with it. 261 * Note: remember to free gpt when finished with it.
260 */ 262 */
261static gpt_header * 263static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
262alloc_read_gpt_header(struct block_device *bdev, u64 lba) 264 u64 lba)
263{ 265{
264 gpt_header *gpt; 266 gpt_header *gpt;
265 unsigned ssz = bdev_logical_block_size(bdev); 267 unsigned ssz = bdev_logical_block_size(state->bdev);
266
267 if (!bdev)
268 return NULL;
269 268
270 gpt = kzalloc(ssz, GFP_KERNEL); 269 gpt = kzalloc(ssz, GFP_KERNEL);
271 if (!gpt) 270 if (!gpt)
272 return NULL; 271 return NULL;
273 272
274 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 273 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
275 kfree(gpt); 274 kfree(gpt);
276 gpt=NULL; 275 gpt=NULL;
277 return NULL; 276 return NULL;
@@ -282,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
282 281
283/** 282/**
284 * is_gpt_valid() - tests one GPT header and PTEs for validity 283 * is_gpt_valid() - tests one GPT header and PTEs for validity
285 * @bdev 284 * @state
286 * @lba is the logical block address of the GPT header to test 285 * @lba is the logical block address of the GPT header to test
287 * @gpt is a GPT header ptr, filled on return. 286 * @gpt is a GPT header ptr, filled on return.
288 * @ptes is a PTEs ptr, filled on return. 287 * @ptes is a PTEs ptr, filled on return.
@@ -290,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
290 * Description: returns 1 if valid, 0 on error. 289 * Description: returns 1 if valid, 0 on error.
291 * If valid, returns pointers to newly allocated GPT header and PTEs. 290 * If valid, returns pointers to newly allocated GPT header and PTEs.
292 */ 291 */
293static int 292static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
294is_gpt_valid(struct block_device *bdev, u64 lba, 293 gpt_header **gpt, gpt_entry **ptes)
295 gpt_header **gpt, gpt_entry **ptes)
296{ 294{
297 u32 crc, origcrc; 295 u32 crc, origcrc;
298 u64 lastlba; 296 u64 lastlba;
299 297
300 if (!bdev || !gpt || !ptes) 298 if (!ptes)
301 return 0; 299 return 0;
302 if (!(*gpt = alloc_read_gpt_header(bdev, lba))) 300 if (!(*gpt = alloc_read_gpt_header(state, lba)))
303 return 0; 301 return 0;
304 302
305 /* Check the GUID Partition Table signature */ 303 /* Check the GUID Partition Table signature */
@@ -335,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
335 /* Check the first_usable_lba and last_usable_lba are 333 /* Check the first_usable_lba and last_usable_lba are
336 * within the disk. 334 * within the disk.
337 */ 335 */
338 lastlba = last_lba(bdev); 336 lastlba = last_lba(state->bdev);
339 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { 337 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
340 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", 338 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
341 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), 339 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -349,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
349 goto fail; 347 goto fail;
350 } 348 }
351 349
352 if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) 350 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
353 goto fail; 351 goto fail;
354 352
355 /* Check the GUID Partition Entry Array CRC */ 353 /* Check the GUID Partition Entry Array CRC */
@@ -494,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
494 492
495/** 493/**
496 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 494 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
497 * @bdev 495 * @state
498 * @gpt is a GPT header ptr, filled on return. 496 * @gpt is a GPT header ptr, filled on return.
499 * @ptes is a PTEs ptr, filled on return. 497 * @ptes is a PTEs ptr, filled on return.
500 * Description: Returns 1 if valid, 0 on error. 498 * Description: Returns 1 if valid, 0 on error.
@@ -507,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
507 * This protects against devices which misreport their size, and forces 505 * This protects against devices which misreport their size, and forces
508 * the user to decide to use the Alternate GPT. 506 * the user to decide to use the Alternate GPT.
509 */ 507 */
510static int 508static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
511find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) 509 gpt_entry **ptes)
512{ 510{
513 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 511 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
514 gpt_header *pgpt = NULL, *agpt = NULL; 512 gpt_header *pgpt = NULL, *agpt = NULL;
515 gpt_entry *pptes = NULL, *aptes = NULL; 513 gpt_entry *pptes = NULL, *aptes = NULL;
516 legacy_mbr *legacymbr; 514 legacy_mbr *legacymbr;
517 u64 lastlba; 515 u64 lastlba;
518 if (!bdev || !gpt || !ptes) 516
517 if (!ptes)
519 return 0; 518 return 0;
520 519
521 lastlba = last_lba(bdev); 520 lastlba = last_lba(state->bdev);
522 if (!force_gpt) { 521 if (!force_gpt) {
523 /* This will be added to the EFI Spec. per Intel after v1.02. */ 522 /* This will be added to the EFI Spec. per Intel after v1.02. */
524 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 523 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
525 if (legacymbr) { 524 if (legacymbr) {
526 read_lba(bdev, 0, (u8 *) legacymbr, 525 read_lba(state, 0, (u8 *) legacymbr,
527 sizeof (*legacymbr)); 526 sizeof (*legacymbr));
528 good_pmbr = is_pmbr_valid(legacymbr); 527 good_pmbr = is_pmbr_valid(legacymbr);
529 kfree(legacymbr); 528 kfree(legacymbr);
530 } 529 }
@@ -532,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
532 goto fail; 531 goto fail;
533 } 532 }
534 533
535 good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, 534 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
536 &pgpt, &pptes); 535 &pgpt, &pptes);
537 if (good_pgpt) 536 if (good_pgpt)
538 good_agpt = is_gpt_valid(bdev, 537 good_agpt = is_gpt_valid(state,
539 le64_to_cpu(pgpt->alternate_lba), 538 le64_to_cpu(pgpt->alternate_lba),
540 &agpt, &aptes); 539 &agpt, &aptes);
541 if (!good_agpt && force_gpt) 540 if (!good_agpt && force_gpt)
542 good_agpt = is_gpt_valid(bdev, lastlba, 541 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
543 &agpt, &aptes);
544 542
545 /* The obviously unsuccessful case */ 543 /* The obviously unsuccessful case */
546 if (!good_pgpt && !good_agpt) 544 if (!good_pgpt && !good_agpt)
@@ -582,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
582} 580}
583 581
584/** 582/**
585 * efi_partition(struct parsed_partitions *state, struct block_device *bdev) 583 * efi_partition(struct parsed_partitions *state)
586 * @state 584 * @state
587 * @bdev
588 * 585 *
589 * Description: called from check.c, if the disk contains GPT 586 * Description: called from check.c, if the disk contains GPT
590 * partitions, sets up partition entries in the kernel. 587 * partitions, sets up partition entries in the kernel.
@@ -601,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
601 * 1 if successful 598 * 1 if successful
602 * 599 *
603 */ 600 */
604int 601int efi_partition(struct parsed_partitions *state)
605efi_partition(struct parsed_partitions *state, struct block_device *bdev)
606{ 602{
607 gpt_header *gpt = NULL; 603 gpt_header *gpt = NULL;
608 gpt_entry *ptes = NULL; 604 gpt_entry *ptes = NULL;
609 u32 i; 605 u32 i;
610 unsigned ssz = bdev_logical_block_size(bdev) / 512; 606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
611 607
612 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
613 kfree(gpt); 609 kfree(gpt);
614 kfree(ptes); 610 kfree(ptes);
615 return 0; 611 return 0;
@@ -622,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
622 u64 size = le64_to_cpu(ptes[i].ending_lba) - 618 u64 size = le64_to_cpu(ptes[i].ending_lba) -
623 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 619 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
624 620
625 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 621 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
626 continue; 622 continue;
627 623
628 put_partition(state, i+1, start * ssz, size * ssz); 624 put_partition(state, i+1, start * ssz, size * ssz);
@@ -630,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
630 /* If this is a RAID volume, tell md */ 626 /* If this is a RAID volume, tell md */
631 if (!efi_guidcmp(ptes[i].partition_type_guid, 627 if (!efi_guidcmp(ptes[i].partition_type_guid,
632 PARTITION_LINUX_RAID_GUID)) 628 PARTITION_LINUX_RAID_GUID))
633 state->parts[i+1].flags = 1; 629 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
634 } 630 }
635 kfree(ptes); 631 kfree(ptes);
636 kfree(gpt); 632 kfree(gpt);
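The efi.c hunks above set the pattern every parser below repeats: the struct block_device * argument goes away, the device is reached as state->bdev, and raw read_dev_sector() calls become read_part_sector() against the state. The check.h side of the change is outside this diff, so the helper sketched here is a reconstruction from the call sites, not the committed definition; the casts dropped at several call sites (e.g. mac.c below) only type-check if it returns void *.

/*
 * Assumed shape of the read_part_sector() helper; fs/partitions/check.h
 * is not part of this diff, so this is a reconstruction only.
 */
#include <linux/genhd.h>	/* Sector, read_dev_sector() */

/* Minimal stand-in: the real parsed_partitions also carries parts[],
 * next, limit and friends; only ->bdev matters for the helper. */
struct parsed_partitions {
	struct block_device *bdev;
};

static inline void *read_part_sector(struct parsed_partitions *state,
				     sector_t n, Sector *p)
{
	/* the void * return is what lets callers drop their casts */
	return read_dev_sector(state->bdev, n, p);
}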
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
110} __attribute__ ((packed)) legacy_mbr; 110} __attribute__ ((packed)) legacy_mbr;
111 111
112/* Functions */ 112/* Functions */
113extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); 113extern int efi_partition(struct parsed_partitions *state);
114 114
115#endif 115#endif
116 116
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
58 58
59/* 59/*
60 */ 60 */
61int 61int ibm_partition(struct parsed_partitions *state)
62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63{ 62{
63 struct block_device *bdev = state->bdev;
64 int blocksize, res; 64 int blocksize, res;
65 loff_t i_size, offset, size, fmt_size; 65 loff_t i_size, offset, size, fmt_size;
66 dasd_information2_t *info; 66 dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 100 /*
101 * Get volume label, extract name and type. 101 * Get volume label, extract name and type.
102 */ 102 */
103 data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); 103 data = read_part_sector(state, info->label_block*(blocksize/512),
104 &sect);
104 if (data == NULL) 105 if (data == NULL)
105 goto out_readerr; 106 goto out_readerr;
106 107
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
193 */ 194 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 195 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 196 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 197 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 198 &sect);
198 while (data != NULL) { 199 while (data != NULL) {
199 struct vtoc_format1_label f1; 200 struct vtoc_format1_label f1;
200 201
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 209 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 210 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 211 blk++;
211 data = read_dev_sector(bdev, blk * 212 data = read_part_sector(state,
212 (blocksize/512), 213 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 214 continue;
215 } 215 }
216 216
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 230 size * (blocksize >> 9));
231 counter++; 231 counter++;
232 blk++; 232 blk++;
233 data = read_dev_sector(bdev, 233 data = read_part_sector(state,
234 blk * (blocksize/512), 234 blk * (blocksize/512), &sect);
235 &sect);
236 } 235 }
237 236
238 if (!data) 237 if (!data)
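ibm.c shows the lightest-touch conversion style: pull the device out of the state once at the top (struct block_device *bdev = state->bdev;) and leave the body untouched apart from the sector reads. A minimal sketch of that pattern, with a hypothetical function name and assuming the check.h types sketched earlier:

#include <linux/blkdev.h>	/* bdev_logical_block_size() */

/* Sketch only: alias the device once so the existing body can keep
 * saying 'bdev' exactly as it did before the refactor. */
static int probe_like_ibm(struct parsed_partitions *state)
{
	struct block_device *bdev = state->bdev;

	return bdev_logical_block_size(bdev) / 512;	/* body unchanged */
}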
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
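ldm_parse_hexbyte() now leans on hex_to_bin() from <linux/kernel.h> (the include added at the top of this file) instead of three hand-rolled range tests per digit: x keeps the high nibble, h is reused for the low one, and any non-hex input propagates out as -1. The same pattern as a standalone sketch, with a hypothetical name:

#include <linux/kernel.h>	/* hex_to_bin(): 0..15, or -1 on non-hex */
#include <linux/types.h>

/* Combine two ASCII hex digits into one byte: "4f" -> 0x4f, "4z" -> -1. */
static int hexpair_to_byte(const u8 *src)
{
	int hi = hex_to_bin(src[0]);
	int lo;

	if (hi < 0)
		return -1;
	lo = hex_to_bin(src[1]);
	if (lo < 0)
		return -1;
	return (hi << 4) | lo;
}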
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
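With every stage now consuming the same state, the shape of the LDM probe is easiest to see with allocation, list setup and cleanup stripped away. A condensed reading aid (the helpers are static to ldm.c, so this only makes sense as if pasted into that file):

/* Probe order of ldm_partition() above; error and cleanup paths elided. */
static int ldm_probe_order(struct parsed_partitions *state, struct ldmdb *ldb)
{
	unsigned long base;

	if (!ldm_validate_partition_table(state))	/* MBR entry of type 0x42? */
		return 0;				/* not a dynamic disk */
	if (!ldm_validate_privheads(state, &ldb->ph))
		return -1;
	base = ldb->ph.config_start;	/* everything else is relative to this */
	if (!ldm_validate_tocblocks(state, base, ldb) ||
	    !ldm_validate_vmdb(state, base, ldb))
		return -1;
	if (!ldm_get_vblks(state, base, ldb))
		return -1;
	return ldm_create_data_partitions(state, ldb) ? 1 : -1;
}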
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -61,7 +64,7 @@ msdos_magic_present(unsigned char *p)
61#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
62#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
63#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
64static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
65{ 68{
66 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
67 Sector sect; 70 Sector sect;
@@ -82,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
82 is_extended_partition(pt)) 85 is_extended_partition(pt))
83 return 0; 86 return 0;
84 } 87 }
85 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
86 if (d) { 89 if (d) {
87 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
88 ret = 1; 91 ret = 1;
@@ -102,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
102 * only for the actual data partitions. 105 * only for the actual data partitions.
103 */ 106 */
104 107
105static void 108static void parse_extended(struct parsed_partitions *state,
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
107 u32 first_sector, u32 first_size)
108{ 110{
109 struct partition *p; 111 struct partition *p;
110 Sector sect; 112 Sector sect;
111 unsigned char *data; 113 unsigned char *data;
112 u32 this_sector, this_size; 114 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
114 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
115 without finding a data partition */ 117 without finding a data partition */
116 int i; 118 int i;
@@ -123,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
123 return; 125 return;
124 if (state->next == state->limit) 126 if (state->next == state->limit)
125 return; 127 return;
126 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
127 if (!data) 129 if (!data)
128 return; 130 return;
129 131
@@ -145,14 +147,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 147 * First process the data partition(s)
146 */ 148 */
147 for (i=0; i<4; i++, p++) { 149 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 150 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 151 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 152 continue;
151 153
152 /* Check the 3rd and 4th entries - 154 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 155 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 156 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 157 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 158 next = this_sector + offs;
157 if (i >= 2) { 159 if (i >= 2) {
158 if (offs + size > this_size) 160 if (offs + size > this_size)
@@ -179,13 +181,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 181 */
180 p -= 4; 182 p -= 4;
181 for (i=0; i<4; i++, p++) 183 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 184 if (nr_sects(p) && is_extended_partition(p))
183 break; 185 break;
184 if (i == 4) 186 if (i == 4)
185 goto done; /* nothing left to do */ 187 goto done; /* nothing left to do */
186 188
187 this_sector = first_sector + START_SECT(p) * sector_size; 189 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 190 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 191 put_dev_sector(sect);
190 } 192 }
191done: 193done:
@@ -195,9 +197,8 @@ done:
195/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
196 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
197 199
198static void 200static void parse_solaris_x86(struct parsed_partitions *state,
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
200 u32 offset, u32 size, int origin)
201{ 202{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 204 Sector sect;
@@ -205,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
205 int i; 206 int i;
206 short max_nparts; 207 short max_nparts;
207 208
208 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
209 if (!v) 210 if (!v)
210 return; 211 return;
211 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -242,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
242 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
243 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
244 */ 245 */
245static void 246static void parse_bsd(struct parsed_partitions *state,
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
247 u32 offset, u32 size, int origin, char *flavour, 248 int max_partitions)
248 int max_partitions)
249{ 249{
250 Sector sect; 250 Sector sect;
251 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
252 struct bsd_partition *p; 252 struct bsd_partition *p;
253 253
254 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
255 if (!l) 255 if (!l)
256 return; 256 return;
257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -263,7 +263,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 263 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 264 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 266 sector_t bsd_start, bsd_size;
267 267
268 if (state->next == state->limit) 268 if (state->next == state->limit)
269 break; 269 break;
@@ -288,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
288} 288}
289#endif 289#endif
290 290
291static void 291static void parse_freebsd(struct parsed_partitions *state,
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
293 u32 offset, u32 size, int origin)
294{ 293{
295#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
297 "bsd", BSD_MAXPARTITIONS);
298#endif 296#endif
299} 297}
300 298
301static void 299static void parse_netbsd(struct parsed_partitions *state,
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
303 u32 offset, u32 size, int origin)
304{ 301{
305#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
307 "netbsd", BSD_MAXPARTITIONS);
308#endif 304#endif
309} 305}
310 306
311static void 307static void parse_openbsd(struct parsed_partitions *state,
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
313 u32 offset, u32 size, int origin)
314{ 309{
315#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
317 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
318#endif 313#endif
319} 314}
320 315
@@ -322,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
322 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
323 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
324 */ 319 */
325static void 320static void parse_unixware(struct parsed_partitions *state,
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
327 u32 offset, u32 size, int origin)
328{ 322{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 324 Sector sect;
331 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
332 struct unixware_slice *p; 326 struct unixware_slice *p;
333 327
334 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
335 if (!l) 329 if (!l)
336 return; 330 return;
337 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -348,7 +342,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 342
349 if (p->s_label != UNIXWARE_FS_UNUSED) 343 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 344 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 345 le32_to_cpu(p->start_sect),
346 le32_to_cpu(p->nr_sects));
352 p++; 347 p++;
353 } 348 }
354 put_dev_sector(sect); 349 put_dev_sector(sect);
@@ -361,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
361 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
362 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
363 */ 358 */
364static void 359static void parse_minix(struct parsed_partitions *state,
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
366 u32 offset, u32 size, int origin)
367{ 361{
368#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 363 Sector sect;
@@ -371,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
371 struct partition *p; 365 struct partition *p;
372 int i; 366 int i;
373 367
374 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
375 if (!data) 369 if (!data)
376 return; 370 return;
377 371
@@ -390,7 +384,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 384 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 385 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 386 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 387 start_sect(p), nr_sects(p));
394 } 388 }
395 printk(" >\n"); 389 printk(" >\n");
396 } 390 }
@@ -400,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
400 394
401static struct { 395static struct {
402 unsigned char id; 396 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
404 u32, u32, int);
405} subtypes[] = { 398} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -413,16 +406,16 @@ static struct {
413 {0, NULL}, 406 {0, NULL},
414}; 407};
415 408
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
417{ 410{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
419 Sector sect; 412 Sector sect;
420 unsigned char *data; 413 unsigned char *data;
421 struct partition *p; 414 struct partition *p;
422 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
423 int slot; 416 int slot;
424 417
425 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
426 if (!data) 419 if (!data)
427 return -1; 420 return -1;
428 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -430,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
430 return 0; 423 return 0;
431 } 424 }
432 425
433 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
434 put_dev_sector(sect); 427 put_dev_sector(sect);
435 printk( " [AIX]"); 428 printk( " [AIX]");
436 return 0; 429 return 0;
@@ -483,22 +476,29 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 476
484 state->next = 5; 477 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 478 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 479 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 480 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 481 if (!size)
489 continue; 482 continue;
490 if (is_extended_partition(p)) { 483 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 484 /*
492 extended partition, but leave room for LILO */ 485 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 486 * extended partition, but leave room for LILO
487 * FIXME: this uses one logical sector for > 512b
488 * sector, although it may not be enough/proper.
489 */
490 sector_t n = 2;
491 n = min(size, max(sector_size, n));
492 put_partition(state, slot, start, n);
493
494 printk(" <"); 494 printk(" <");
495 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
496 printk(" >"); 496 printk(" >");
497 continue; 497 continue;
498 } 498 }
499 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
500 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
501 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
502 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
503 printk("[DM]"); 503 printk("[DM]");
504 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
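The extended-partition stub used to be pinned at two 512-byte units (size == 1 ? 1 : 2): large enough for LILO's boot area, small enough that mkfs or mkswap on it fails. On a 4K-logical-sector disk two 512-byte units is less than one addressable sector, hence the new clamp n = min(size, max(sector_size, n)). Worked through in 512-byte units: sector_size is 1 on a 512B disk, so n = min(size, 2) as before; on a 4Kn disk sector_size is 8, so n becomes min(size, 8). As a sketch:

#include <linux/kernel.h>	/* min(), max() */
#include <linux/types.h>

/* Stub length (in 512B units) for an extended-partition entry; the
 * wrapper function exists only for illustration. */
static sector_t extended_stub_len(sector_t size, sector_t sector_size)
{
	sector_t n = 2;				/* legacy LILO allowance */

	n = min(size, max(sector_size, n));	/* >= 1 logical sector, <= size */
	return n;	/* 2 on 512B disks, 8 on 4Kn, never past 'size' */
}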
@@ -513,7 +513,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 513 unsigned char id = SYS_IND(p);
514 int n; 514 int n;
515 515
516 if (!NR_SECTS(p)) 516 if (!nr_sects(p))
517 continue; 517 continue;
518 518
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 521
522 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
523 continue; 523 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
525 NR_SECTS(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
526 } 526 }
527 put_dev_sector(sect); 527 put_dev_sector(sect);
528 return 1; 528 return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..db6eaaba0dd8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
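pipe_max_pages caps how far a non-root user may grow a pipe (default 16 * PIPE_DEF_BUFFERS slots) and is writable by root via /proc/sys/fs/pipe-max-pages. The new <linux/fcntl.h> include points at the user-visible half of this series, fcntl() commands for reading and resizing pipe capacity; those hunks are not shown here, so the userspace sketch below assumes the F_GETPIPE_SZ/F_SETPIPE_SZ interface from the same series, with the argument counted in pages as in this version:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	/* ask for 32 slots; an unprivileged caller gets EINVAL
	 * if 32 exceeds pipe-max-pages */
	if (fcntl(fds[1], F_SETPIPE_SZ, 32) < 0)
		perror("F_SETPIPE_SZ");
	printf("slots: %d\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}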
@@ -222,6 +230,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
222 230
223 return kmap(buf->page); 231 return kmap(buf->page);
224} 232}
233EXPORT_SYMBOL(generic_pipe_buf_map);
225 234
226/** 235/**
227 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 236 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +250,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
241 } else 250 } else
242 kunmap(buf->page); 251 kunmap(buf->page);
243} 252}
253EXPORT_SYMBOL(generic_pipe_buf_unmap);
244 254
245/** 255/**
246 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 256 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +281,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
271 281
272 return 1; 282 return 1;
273} 283}
284EXPORT_SYMBOL(generic_pipe_buf_steal);
274 285
275/** 286/**
276 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 287 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +297,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
286{ 297{
287 page_cache_get(buf->page); 298 page_cache_get(buf->page);
288} 299}
300EXPORT_SYMBOL(generic_pipe_buf_get);
289 301
290/** 302/**
291 * generic_pipe_buf_confirm - verify contents of the pipe buffer 303 * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +313,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
301{ 313{
302 return 0; 314 return 0;
303} 315}
316EXPORT_SYMBOL(generic_pipe_buf_confirm);
304 317
305/** 318/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 319 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +328,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
315{ 328{
316 page_cache_release(buf->page); 329 page_cache_release(buf->page);
317} 330}
331EXPORT_SYMBOL(generic_pipe_buf_release);
318 332
319static const struct pipe_buf_operations anon_pipe_buf_ops = { 333static const struct pipe_buf_operations anon_pipe_buf_ops = {
320 .can_merge = 1, 334 .can_merge = 1,
@@ -390,7 +404,7 @@ redo:
390 if (!buf->len) { 404 if (!buf->len) {
391 buf->ops = NULL; 405 buf->ops = NULL;
392 ops->release(pipe, buf); 406 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 407 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 408 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 409 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 410 do_wakeup = 1;
@@ -472,7 +486,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 486 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 487 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 488 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 489 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 490 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 491 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 492 int offset = buf->offset + buf->len;
@@ -518,8 +532,8 @@ redo1:
518 break; 532 break;
519 } 533 }
520 bufs = pipe->nrbufs; 534 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 535 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 536 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 537 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 538 struct page *page = pipe->tmp_page;
525 char *src; 539 char *src;
@@ -580,7 +594,7 @@ redo2:
580 if (!total_len) 594 if (!total_len)
581 break; 595 break;
582 } 596 }
583 if (bufs < PIPE_BUFFERS) 597 if (bufs < pipe->buffers)
584 continue; 598 continue;
585 if (filp->f_flags & O_NONBLOCK) { 599 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 600 if (!ret)
@@ -640,7 +654,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 654 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 655 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 656 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 657 buf = (buf+1) & (pipe->buffers - 1);
644 } 658 }
645 mutex_unlock(&inode->i_mutex); 659 mutex_unlock(&inode->i_mutex);
646 660
@@ -671,7 +685,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 685 }
672 686
673 if (filp->f_mode & FMODE_WRITE) { 687 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 688 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 689 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 690 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 691 * behave exactly like pipes for poll().
@@ -877,25 +891,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 891
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 892 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 893 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 894 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 895 if (pipe->bufs) {
882 pipe->inode = inode; 896 init_waitqueue_head(&pipe->wait);
897 pipe->r_counter = pipe->w_counter = 1;
898 pipe->inode = inode;
899 pipe->buffers = PIPE_DEF_BUFFERS;
900 return pipe;
901 }
902 kfree(pipe);
883 } 903 }
884 904
885 return pipe; 905 return NULL;
886} 906}
887 907
888void __free_pipe_info(struct pipe_inode_info *pipe) 908void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 909{
890 int i; 910 int i;
891 911
892 for (i = 0; i < PIPE_BUFFERS; i++) { 912 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 913 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 914 if (buf->ops)
895 buf->ops->release(pipe, buf); 915 buf->ops->release(pipe, buf);
896 } 916 }
897 if (pipe->tmp_page) 917 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 918 __free_page(pipe->tmp_page);
919 kfree(pipe->bufs);
899 kfree(pipe); 920 kfree(pipe);
900} 921}
901 922
@@ -1094,6 +1115,94 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1115}
1095 1116
1096/* 1117/*
1118 * Allocate a new array of pipe buffers and copy the info over. Returns the
1119 * pipe size if successful, or -ERROR on error.
1120 */
1121static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1122{
1123 struct pipe_buffer *bufs;
1124
1125 /*
1126 * Must be a power-of-2 currently
1127 */
1128 if (!is_power_of_2(arg))
1129 return -EINVAL;
1130
1131 /*
1132 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1133 * expect a lot of shrink+grow operations, just free and allocate
1134 * again like we would do for growing. If the pipe currently
1135 * contains more buffers than arg, then return busy.
1136 */
1137 if (arg < pipe->nrbufs)
1138 return -EBUSY;
1139
1140 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1141 if (unlikely(!bufs))
1142 return -ENOMEM;
1143
1144 /*
1145 * The pipe array wraps around, so just start the new one at zero
1146 * and adjust the indexes.
1147 */
1148 if (pipe->nrbufs) {
1149 const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
1150 const unsigned int head = pipe->nrbufs - tail;
1151
1152 if (head)
1153 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1154 if (tail)
1155 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1156 }
1157
1158 pipe->curbuf = 0;
1159 kfree(pipe->bufs);
1160 pipe->bufs = bufs;
1161 pipe->buffers = arg;
1162 return arg;
1163}
1164
1165long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1166{
1167 struct pipe_inode_info *pipe;
1168 long ret;
1169
1170 pipe = file->f_path.dentry->d_inode->i_pipe;
1171 if (!pipe)
1172 return -EBADF;
1173
1174 mutex_lock(&pipe->inode->i_mutex);
1175
1176 switch (cmd) {
1177 case F_SETPIPE_SZ:
1178 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) {
1179 ret = -EINVAL;
1180 goto out;
1181 }
1182 /*
1183 * The pipe needs to be at least 2 pages large to
1184 * guarantee POSIX behaviour.
1185 */
1186 if (arg < 2) {
1187 ret = -EINVAL;
1188 goto out;
1189 }
1190 ret = pipe_set_size(pipe, arg);
1191 break;
1192 case F_GETPIPE_SZ:
1193 ret = pipe->buffers;
1194 break;
1195 default:
1196 ret = -EINVAL;
1197 break;
1198 }
1199
1200out:
1201 mutex_unlock(&pipe->inode->i_mutex);
1202 return ret;
1203}
1204
1205/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1206 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1207 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1208 * any operations on the root directory. However, we need a non-trivial
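
Taken together, the pipe.c hunks replace the compile-time PIPE_BUFFERS ring with a per-pipe pipe->buffers array, allocated at PIPE_DEF_BUFFERS and resizable through two new fcntls. Note that pipe_set_size() restarts the ring at slot zero: the head run is copied from pipe->bufs + curbuf and the wrapped tail from the start of the old array, which is why the second memcpy reads from pipe->bufs rather than pipe->bufs + curbuf. A hedged userspace sketch of the interface as defined here, where the argument counts pipe buffers (pages), not bytes, and the fallback fcntl numbers are this series' F_LINUX_SPECIFIC_BASE assignments:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	1031	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	1032	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];

	if (pipe(fds) < 0)
		return 1;
	/* grow the ring to 64 buffers: must be a power of two, and
	 * anything above pipe_max_pages requires CAP_SYS_ADMIN */
	if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)
		perror("F_SETPIPE_SZ");
	/* F_GETPIPE_SZ reports the current number of buffers */
	printf("pipe buffers: %ld\n", (long)fcntl(fds[1], F_GETPIPE_SZ));
	close(fds[0]);
	close(fds[1]);
	return 0;
}
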
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
68#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
69#include <linux/pagemap.h> 69#include <linux/pagemap.h>
70#include <linux/swap.h> 70#include <linux/swap.h>
71#include <linux/slab.h>
72#include <linux/smp.h> 71#include <linux/smp.h>
73#include <linux/signal.h> 72#include <linux/signal.h>
74#include <linux/highmem.h> 73#include <linux/highmem.h>
@@ -82,7 +81,6 @@
82#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
83#include <linux/ptrace.h> 82#include <linux/ptrace.h>
84#include <linux/tracehook.h> 83#include <linux/tracehook.h>
85#include <linux/swapops.h>
86 84
87#include <asm/pgtable.h> 85#include <asm/pgtable.h>
88#include <asm/processor.h> 86#include <asm/processor.h>
@@ -269,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
269 shpending = p->signal->shared_pending.signal; 267 shpending = p->signal->shared_pending.signal;
270 blocked = p->blocked; 268 blocked = p->blocked;
271 collect_sigign_sigcatch(p, &ignored, &caught); 269 collect_sigign_sigcatch(p, &ignored, &caught);
272 num_threads = atomic_read(&p->signal->count); 270 num_threads = get_nr_threads(p);
273 rcu_read_lock(); /* FIXME: is this correct? */ 271 rcu_read_lock(); /* FIXME: is this correct? */
274 qsize = atomic_read(&__task_cred(p)->user->sigpending); 272 qsize = atomic_read(&__task_cred(p)->user->sigpending);
275 rcu_read_unlock(); 273 rcu_read_unlock();
@@ -412,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
412 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 410 tty_nr = new_encode_dev(tty_devnum(sig->tty));
413 } 411 }
414 412
415 num_threads = atomic_read(&sig->count); 413 num_threads = get_nr_threads(task);
416 collect_sigign_sigcatch(task, &sigign, &sigcatch); 414 collect_sigign_sigcatch(task, &sigign, &sigcatch);
417 415
418 cmin_flt = sig->cmin_flt; 416 cmin_flt = sig->cmin_flt;
@@ -496,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
496 rsslim, 494 rsslim,
497 mm ? mm->start_code : 0, 495 mm ? mm->start_code : 0,
498 mm ? mm->end_code : 0, 496 mm ? mm->end_code : 0,
499 (permitted && mm) ? task->stack_start : 0, 497 (permitted && mm) ? mm->start_stack : 0,
500 esp, 498 esp,
501 eip, 499 eip,
502 /* The signal information here is obsolete. 500 /* The signal information here is obsolete.
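
Both the task_sig() and do_task_stat() hunks above stop peeking at signal->count and call get_nr_threads() instead; the open-coded, siglock-taking copy of that helper is deleted from fs/proc/base.c just below. A minimal sketch of the shared helper, assuming it simply reads the atomic thread count (its exact header home is an assumption here):

static inline int get_nr_threads(struct task_struct *tsk)
{
	/* lock-free: an atomic read is enough for reporting purposes */
	return atomic_read(&tsk->signal->count);
}
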
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h> 83#include <linux/fs_struct.h>
84#include <linux/slab.h>
84#include "internal.h" 85#include "internal.h"
85 86
86/* NOTE: 87/* NOTE:
@@ -165,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
165 return result; 166 return result;
166} 167}
167 168
168static int get_nr_threads(struct task_struct *tsk)
169{
170 unsigned long flags;
171 int count = 0;
172
173 if (lock_task_sighand(tsk, &flags)) {
174 count = atomic_read(&tsk->signal->count);
175 unlock_task_sighand(tsk, &flags);
176 }
177 return count;
178}
179
180static int proc_cwd_link(struct inode *inode, struct path *path) 169static int proc_cwd_link(struct inode *inode, struct path *path)
181{ 170{
182 struct task_struct *task = get_proc_task(inode); 171 struct task_struct *task = get_proc_task(inode);
@@ -442,12 +431,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 431unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 432static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 433{
445 unsigned long points; 434 unsigned long points = 0;
446 struct timespec uptime; 435 struct timespec uptime;
447 436
448 do_posix_clock_monotonic_gettime(&uptime); 437 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 438 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 439 if (pid_alive(task))
440 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 441 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 442 return sprintf(buffer, "%lu\n", points);
453} 443}
@@ -728,6 +718,7 @@ out_no_task:
728 718
729static const struct file_operations proc_info_file_operations = { 719static const struct file_operations proc_info_file_operations = {
730 .read = proc_info_read, 720 .read = proc_info_read,
721 .llseek = generic_file_llseek,
731}; 722};
732 723
733static int proc_single_show(struct seq_file *m, void *v) 724static int proc_single_show(struct seq_file *m, void *v)
@@ -985,6 +976,7 @@ out_no_task:
985 976
986static const struct file_operations proc_environ_operations = { 977static const struct file_operations proc_environ_operations = {
987 .read = environ_read, 978 .read = environ_read,
979 .llseek = generic_file_llseek,
988}; 980};
989 981
990static ssize_t oom_adjust_read(struct file *file, char __user *buf, 982static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1058,6 +1050,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1058static const struct file_operations proc_oom_adjust_operations = { 1050static const struct file_operations proc_oom_adjust_operations = {
1059 .read = oom_adjust_read, 1051 .read = oom_adjust_read,
1060 .write = oom_adjust_write, 1052 .write = oom_adjust_write,
1053 .llseek = generic_file_llseek,
1061}; 1054};
1062 1055
1063#ifdef CONFIG_AUDITSYSCALL 1056#ifdef CONFIG_AUDITSYSCALL
@@ -1129,6 +1122,7 @@ out_free_page:
1129static const struct file_operations proc_loginuid_operations = { 1122static const struct file_operations proc_loginuid_operations = {
1130 .read = proc_loginuid_read, 1123 .read = proc_loginuid_read,
1131 .write = proc_loginuid_write, 1124 .write = proc_loginuid_write,
1125 .llseek = generic_file_llseek,
1132}; 1126};
1133 1127
1134static ssize_t proc_sessionid_read(struct file * file, char __user * buf, 1128static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1149,6 +1143,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1149 1143
1150static const struct file_operations proc_sessionid_operations = { 1144static const struct file_operations proc_sessionid_operations = {
1151 .read = proc_sessionid_read, 1145 .read = proc_sessionid_read,
1146 .llseek = generic_file_llseek,
1152}; 1147};
1153#endif 1148#endif
1154 1149
@@ -1200,6 +1195,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
1200static const struct file_operations proc_fault_inject_operations = { 1195static const struct file_operations proc_fault_inject_operations = {
1201 .read = proc_fault_inject_read, 1196 .read = proc_fault_inject_read,
1202 .write = proc_fault_inject_write, 1197 .write = proc_fault_inject_write,
1198 .llseek = generic_file_llseek,
1203}; 1199};
1204#endif 1200#endif
1205 1201
@@ -1941,7 +1937,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1941} 1937}
1942 1938
1943static const struct file_operations proc_fdinfo_file_operations = { 1939static const struct file_operations proc_fdinfo_file_operations = {
1944 .open = nonseekable_open, 1940 .open = nonseekable_open,
1945 .read = proc_fdinfo_read, 1941 .read = proc_fdinfo_read,
1946}; 1942};
1947 1943
@@ -2225,6 +2221,7 @@ out_no_task:
2225static const struct file_operations proc_pid_attr_operations = { 2221static const struct file_operations proc_pid_attr_operations = {
2226 .read = proc_pid_attr_read, 2222 .read = proc_pid_attr_read,
2227 .write = proc_pid_attr_write, 2223 .write = proc_pid_attr_write,
2224 .llseek = generic_file_llseek,
2228}; 2225};
2229 2226
2230static const struct pid_entry attr_dir_stuff[] = { 2227static const struct pid_entry attr_dir_stuff[] = {
@@ -2345,6 +2342,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2345static const struct file_operations proc_coredump_filter_operations = { 2342static const struct file_operations proc_coredump_filter_operations = {
2346 .read = proc_coredump_filter_read, 2343 .read = proc_coredump_filter_read,
2347 .write = proc_coredump_filter_write, 2344 .write = proc_coredump_filter_write,
2345 .llseek = generic_file_llseek,
2348}; 2346};
2349#endif 2347#endif
2350 2348
@@ -2434,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2434 const struct pid_entry *p = ptr; 2432 const struct pid_entry *p = ptr;
2435 struct inode *inode; 2433 struct inode *inode;
2436 struct proc_inode *ei; 2434 struct proc_inode *ei;
2437 struct dentry *error = ERR_PTR(-EINVAL); 2435 struct dentry *error;
2438 2436
2439 /* Allocate the inode */ 2437 /* Allocate the inode */
2440 error = ERR_PTR(-ENOMEM); 2438 error = ERR_PTR(-ENOMEM);
@@ -2784,7 +2782,7 @@ out:
2784 2782
2785struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2783struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2786{ 2784{
2787 struct dentry *result = ERR_PTR(-ENOENT); 2785 struct dentry *result;
2788 struct task_struct *task; 2786 struct task_struct *task;
2789 unsigned tgid; 2787 unsigned tgid;
2790 struct pid_namespace *ns; 2788 struct pid_namespace *ns;
@@ -2907,7 +2905,7 @@ out_no_task:
2907 */ 2905 */
2908static const struct pid_entry tid_base_stuff[] = { 2906static const struct pid_entry tid_base_stuff[] = {
2909 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2907 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2910 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), 2908 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2911 REG("environ", S_IRUSR, proc_environ_operations), 2909 REG("environ", S_IRUSR, proc_environ_operations),
2912 INF("auxv", S_IRUSR, proc_pid_auxv), 2910 INF("auxv", S_IRUSR, proc_pid_auxv),
2913 ONE("status", S_IRUGO, proc_pid_status), 2911 ONE("status", S_IRUGO, proc_pid_status),
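
The recurring .llseek = generic_file_llseek additions in this file (and in kcore, kmsg and vmcore below) give each file_operations an explicit seek method rather than the implicit, BKL-era default_llseek fallback; proc_fdinfo deliberately stays nonseekable. The pattern, sketched for a hypothetical read-only proc file (example_read is illustration only):

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return 0;	/* hypothetical handler: always EOF */
}

static const struct file_operations example_proc_operations = {
	.read	= example_read,
	/* explicit seek semantics instead of a silent default_llseek */
	.llseek	= generic_file_llseek,
};
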
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/idr.h> 19#include <linux/idr.h>
@@ -342,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
342/* 343/*
343 * Return an inode number between PROC_DYNAMIC_FIRST and 344 * Return an inode number between PROC_DYNAMIC_FIRST and
344 * 0xffffffff, or zero on failure. 345 * 0xffffffff, or zero on failure.
345 *
346 * Current inode allocations in the proc-fs (hex-numbers):
347 *
348 * 00000000 reserved
349 * 00000001-00000fff static entries (goners)
350 * 001 root-ino
351 *
352 * 00001000-00001fff unused
353 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
354 * 80000000-efffffff unused
355 * f0000000-ffffffff dynamic entries
356 *
357 * Goal:
358 * Once we split the thing into several virtual filesystems,
359 * we will get rid of magical ranges (and this comment, BTW).
360 */ 346 */
361static unsigned int get_inode_number(void) 347static unsigned int get_inode_number(void)
362{ 348{
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..aea8502e58a3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -231,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
231 if (rv == -ENOIOCTLCMD) 232 if (rv == -ENOIOCTLCMD)
232 rv = -EINVAL; 233 rv = -EINVAL;
233 } else if (ioctl) { 234 } else if (ioctl) {
234 lock_kernel(); 235 WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
236 "%pf will be called without the BKL held\n", ioctl);
235 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); 237 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
236 unlock_kernel();
237 } 238 }
238 239
239 pde_users_dec(pde); 240 pde_users_dec(pde);
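
The WARN_ONCE above replaces the lock_kernel()/unlock_kernel() pair: procfs no longer wraps legacy ->ioctl handlers in the BKL, it just warns that they must convert. A hypothetical conversion sketch (the example_* names and private mutex are illustration only):

static DEFINE_MUTEX(example_mutex);

static long example_unlocked_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
	long ret = -ENOTTY;

	/* serialize with a subsystem lock instead of relying on the BKL */
	mutex_lock(&example_mutex);
	switch (cmd) {
	/* ... per-command handling ... */
	}
	mutex_unlock(&example_mutex);
	return ret;
}
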
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/slab.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/io.h> 24#include <asm/io.h>
24#include <linux/list.h> 25#include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 491 }
491 read_unlock(&kclist_lock); 492 read_unlock(&kclist_lock);
492 493
493 if (m == NULL) { 494 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
495 return -EFAULT; 496 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -557,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
557static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
558 .read = read_kcore, 559 .read = read_kcore,
559 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek,
560}; 562};
561 563
562#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
@@ -586,7 +588,7 @@ static struct kcore_list kcore_text;
586 */ 588 */
587static void __init proc_kcore_text_init(void) 589static void __init proc_kcore_text_init(void)
588{ 590{
589 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); 591 kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
590} 592}
591#else 593#else
592static void __init proc_kcore_text_init(void) 594static void __init proc_kcore_text_init(void)
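
The read_kcore() fix above is the classic list_for_each_entry() pitfall: if the loop finishes without a break, the iterator is not NULL, it is the container_of() of the list head itself, so end-of-list must be detected by comparing positions. A sketch of the idiom (handle_unmapped_range() stands in for the clear_user() path):

struct kcore_list *m;

list_for_each_entry(m, &kclist_head, list) {
	if (start >= m->addr && start < m->addr + m->size)
		break;		/* found the entry covering this range */
}
/* no break taken: &m->list has wrapped around to the head, while
 * "m == NULL" would never trigger */
if (&m->list == &kclist_head)
	handle_unmapped_range();
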
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
53 .poll = kmsg_poll, 53 .poll = kmsg_poll,
54 .open = kmsg_open, 54 .open = kmsg_open,
55 .release = kmsg_release, 55 .release = kmsg_release,
56 .llseek = generic_file_llseek,
56}; 57};
57 58
58static int __init proc_kmsg_init(void) 59static int __init proc_kmsg_init(void)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h> 13#include <linux/of.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <asm/prom.h> 16#include <asm/prom.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include "internal.h" 18#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
110 if (err) 110 if (err)
111 return; 111 return;
112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
113 err = PTR_ERR(proc_mnt);
114 if (IS_ERR(proc_mnt)) { 113 if (IS_ERR(proc_mnt)) {
115 unregister_filesystem(&proc_fs_type); 114 unregister_filesystem(&proc_fs_type);
116 return; 115 return;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/gfp.h>
4#include <linux/init.h> 3#include <linux/init.h>
5#include <linux/interrupt.h> 4#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 5#include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
4#include <linux/seq_file.h> 4#include <linux/seq_file.h>
5#include <linux/highmem.h> 5#include <linux/highmem.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <linux/slab.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/mempolicy.h> 9#include <linux/mempolicy.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -246,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
246 } else if (vma->vm_start <= mm->start_stack && 247 } else if (vma->vm_start <= mm->start_stack &&
247 vma->vm_end >= mm->start_stack) { 248 vma->vm_end >= mm->start_stack) {
248 name = "[stack]"; 249 name = "[stack]";
249 } else {
250 unsigned long stack_start;
251 struct proc_maps_private *pmp;
252
253 pmp = m->private;
254 stack_start = pmp->task->stack_start;
255
256 if (vma->vm_start <= stack_start &&
257 vma->vm_end >= stack_start) {
258 pad_len_spaces(m, len);
259 seq_printf(m,
260 "[threadstack:%08lx]",
261#ifdef CONFIG_STACK_GROWSUP
262 vma->vm_end - stack_start
263#else
264 stack_start - vma->vm_start
265#endif
266 );
267 }
268 } 250 }
269 } else { 251 } else {
270 name = "[vdso]"; 252 name = "[vdso]";
@@ -406,6 +388,7 @@ static int show_smap(struct seq_file *m, void *v)
406 388
407 memset(&mss, 0, sizeof mss); 389 memset(&mss, 0, sizeof mss);
408 mss.vma = vma; 390 mss.vma = vma;
391 /* mmap_sem is held in m_start */
409 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 392 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
410 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 393 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
411 394
@@ -552,7 +535,8 @@ const struct file_operations proc_clear_refs_operations = {
552}; 535};
553 536
554struct pagemapread { 537struct pagemapread {
555 u64 __user *out, *end; 538 int pos, len;
539 u64 *buffer;
556}; 540};
557 541
558#define PM_ENTRY_BYTES sizeof(u64) 542#define PM_ENTRY_BYTES sizeof(u64)
@@ -575,10 +559,8 @@ struct pagemapread {
575static int add_to_pagemap(unsigned long addr, u64 pfn, 559static int add_to_pagemap(unsigned long addr, u64 pfn,
576 struct pagemapread *pm) 560 struct pagemapread *pm)
577{ 561{
578 if (put_user(pfn, pm->out)) 562 pm->buffer[pm->pos++] = pfn;
579 return -EFAULT; 563 if (pm->pos >= pm->len)
580 pm->out++;
581 if (pm->out >= pm->end)
582 return PM_END_OF_BUFFER; 564 return PM_END_OF_BUFFER;
583 return 0; 565 return 0;
584} 566}
@@ -652,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
652 return err; 634 return err;
653} 635}
654 636
637#ifdef CONFIG_HUGETLB_PAGE
655static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
656{ 639{
657 u64 pme = 0; 640 u64 pme = 0;
@@ -661,31 +644,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
661 return pme; 644 return pme;
662} 645}
663 646
664static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, 647/* This function walks within one hugetlb entry in a single call */
665 unsigned long end, struct mm_walk *walk) 648static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
649 unsigned long addr, unsigned long end,
650 struct mm_walk *walk)
666{ 651{
667 struct vm_area_struct *vma;
668 struct pagemapread *pm = walk->private; 652 struct pagemapread *pm = walk->private;
669 struct hstate *hs = NULL;
670 int err = 0; 653 int err = 0;
654 u64 pfn;
671 655
672 vma = find_vma(walk->mm, addr);
673 if (vma)
674 hs = hstate_vma(vma);
675 for (; addr != end; addr += PAGE_SIZE) { 656 for (; addr != end; addr += PAGE_SIZE) {
676 u64 pfn = PM_NOT_PRESENT; 657 int offset = (addr & ~hmask) >> PAGE_SHIFT;
677 658 pfn = huge_pte_to_pagemap_entry(*pte, offset);
678 if (vma && (addr >= vma->vm_end)) {
679 vma = find_vma(walk->mm, addr);
680 if (vma)
681 hs = hstate_vma(vma);
682 }
683
684 if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
685 /* calculate pfn of the "raw" page in the hugepage. */
686 int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
687 pfn = huge_pte_to_pagemap_entry(*pte, offset);
688 }
689 err = add_to_pagemap(addr, pfn, pm); 659 err = add_to_pagemap(addr, pfn, pm);
690 if (err) 660 if (err)
691 return err; 661 return err;
@@ -695,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
695 665
696 return err; 666 return err;
697} 667}
668#endif /* HUGETLB_PAGE */
698 669
699/* 670/*
700 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -720,21 +691,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
720 * determine which areas of memory are actually mapped and llseek to 691 * determine which areas of memory are actually mapped and llseek to
721 * skip over unmapped regions. 692 * skip over unmapped regions.
722 */ 693 */
694#define PAGEMAP_WALK_SIZE (PMD_SIZE)
723static ssize_t pagemap_read(struct file *file, char __user *buf, 695static ssize_t pagemap_read(struct file *file, char __user *buf,
724 size_t count, loff_t *ppos) 696 size_t count, loff_t *ppos)
725{ 697{
726 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 698 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
727 struct page **pages, *page;
728 unsigned long uaddr, uend;
729 struct mm_struct *mm; 699 struct mm_struct *mm;
730 struct pagemapread pm; 700 struct pagemapread pm;
731 int pagecount;
732 int ret = -ESRCH; 701 int ret = -ESRCH;
733 struct mm_walk pagemap_walk = {}; 702 struct mm_walk pagemap_walk = {};
734 unsigned long src; 703 unsigned long src;
735 unsigned long svpfn; 704 unsigned long svpfn;
736 unsigned long start_vaddr; 705 unsigned long start_vaddr;
737 unsigned long end_vaddr; 706 unsigned long end_vaddr;
707 int copied = 0;
738 708
739 if (!task) 709 if (!task)
740 goto out; 710 goto out;
@@ -757,38 +727,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
757 if (!mm) 727 if (!mm)
758 goto out_task; 728 goto out_task;
759 729
760 730 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
761 uaddr = (unsigned long)buf & PAGE_MASK; 731 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
762 uend = (unsigned long)(buf + count);
763 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
764 ret = 0;
765 if (pagecount == 0)
766 goto out_mm;
767 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
768 ret = -ENOMEM; 732 ret = -ENOMEM;
769 if (!pages) 733 if (!pm.buffer)
770 goto out_mm; 734 goto out_mm;
771 735
772 down_read(&current->mm->mmap_sem);
773 ret = get_user_pages(current, current->mm, uaddr, pagecount,
774 1, 0, pages, NULL);
775 up_read(&current->mm->mmap_sem);
776
777 if (ret < 0)
778 goto out_free;
779
780 if (ret != pagecount) {
781 pagecount = ret;
782 ret = -EFAULT;
783 goto out_pages;
784 }
785
786 pm.out = (u64 __user *)buf;
787 pm.end = (u64 __user *)(buf + count);
788
789 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
790 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
791 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
792 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
793 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
794 743
@@ -807,23 +756,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
807 * user buffer is tracked in "pm", and the walk 756 * user buffer is tracked in "pm", and the walk
808 * will stop when we hit the end of the buffer. 757 * will stop when we hit the end of the buffer.
809 */ 758 */
810 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); 759 ret = 0;
811 if (ret == PM_END_OF_BUFFER) 760 while (count && (start_vaddr < end_vaddr)) {
812 ret = 0; 761 int len;
813 /* don't need mmap_sem for these, but this looks cleaner */ 762 unsigned long end;
814 *ppos += (char __user *)pm.out - buf; 763
815 if (!ret) 764 pm.pos = 0;
816 ret = (char __user *)pm.out - buf; 765 end = start_vaddr + PAGEMAP_WALK_SIZE;
817 766 /* overflow ? */
818out_pages: 767 if (end < start_vaddr || end > end_vaddr)
819 for (; pagecount; pagecount--) { 768 end = end_vaddr;
820 page = pages[pagecount-1]; 769 down_read(&mm->mmap_sem);
821 if (!PageReserved(page)) 770 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
822 SetPageDirty(page); 771 up_read(&mm->mmap_sem);
823 page_cache_release(page); 772 start_vaddr = end;
773
774 len = min(count, PM_ENTRY_BYTES * pm.pos);
775 if (copy_to_user(buf, pm.buffer, len)) {
776 ret = -EFAULT;
777 goto out_free;
778 }
779 copied += len;
780 buf += len;
781 count -= len;
824 } 782 }
783 *ppos += copied;
784 if (!ret || ret == PM_END_OF_BUFFER)
785 ret = copied;
786
825out_free: 787out_free:
826 kfree(pages); 788 kfree(pm.buffer);
827out_mm: 789out_mm:
828 mmput(mm); 790 mmput(mm);
829out_task: 791out_task:
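
After the rewrite, pagemap_read() walks the address space in PAGEMAP_WALK_SIZE (one PMD) chunks into a kmalloc'd staging buffer and copy_to_user()s each chunk, instead of pinning the caller's pages with get_user_pages() and writing entries via put_user(). The user-visible format is unchanged: one 64-bit entry per virtual page, addressable by offset. A hedged userspace sketch of fetching the entry for one address:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* look up the pagemap entry covering vaddr in the calling process;
 * 8 matches the kernel's PM_ENTRY_BYTES */
static int pagemap_entry(uintptr_t vaddr, uint64_t *entry)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, entry, sizeof(*entry),
		  (off_t)(vaddr / pagesize) * 8) != sizeof(*entry)) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
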
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
5#include <linux/fs_struct.h> 5#include <linux/fs_struct.h>
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/ptrace.h> 7#include <linux/ptrace.h>
8#include <linux/slab.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
9#include "internal.h" 10#include "internal.h"
10 11
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..91c817ff02c3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/slab.h>
15#include <linux/highmem.h> 16#include <linux/highmem.h>
16#include <linux/bootmem.h> 17#include <linux/bootmem.h>
17#include <linux/init.h> 18#include <linux/init.h>
@@ -162,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
162 163
163static const struct file_operations proc_vmcore_operations = { 164static const struct file_operations proc_vmcore_operations = {
164 .read = read_vmcore, 165 .read = read_vmcore,
166 .llseek = generic_file_llseek,
165}; 167};
166 168
167static struct vmcore* __init get_new_element(void) 169static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
77 77
78const struct file_operations qnx4_dir_operations = 78const struct file_operations qnx4_dir_operations =
79{ 79{
80 .llseek = generic_file_llseek,
80 .read = generic_read_dir, 81 .read = generic_read_dir,
81 .readdir = qnx4_readdir, 82 .readdir = qnx4_readdir,
82 .fsync = simple_fsync, 83 .fsync = generic_file_fsync,
83}; 84};
84 85
85const struct inode_operations qnx4_dir_inode_operations = 86const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
33 Note that this behavior is currently deprecated and may go away in 33 Note that this behavior is currently deprecated and may go away in
34 future. Please use notification via netlink socket instead. 34 future. Please use notification via netlink socket instead.
35 35
36config QUOTA_DEBUG
37 bool "Additional quota sanity checks"
38 depends on QUOTA
39 default n
40 help
41 If you say Y here, the quota subsystem will perform some additional
42 sanity checks of quota internal structures. If unsure, say N.
43
36# Generic support for tree structured quota files. Selected when needed. 44# Generic support for tree structured quota files. Selected when needed.
37config QUOTA_TREE 45config QUOTA_TREE
38 tristate 46 tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,11 +80,9 @@
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
83#define __DQUOT_PARANOIA
84
85/* 83/*
86 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
87 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats.
88 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and 86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
89 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
90 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -134,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
134__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
135EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
136 134
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
137static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
137#endif
138static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; 139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
140 140
@@ -275,7 +275,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
275static inline void put_dquot_last(struct dquot *dquot) 275static inline void put_dquot_last(struct dquot *dquot)
276{ 276{
277 list_add_tail(&dquot->dq_free, &free_dquots); 277 list_add_tail(&dquot->dq_free, &free_dquots);
278 dqstats.free_dquots++; 278 dqstats_inc(DQST_FREE_DQUOTS);
279} 279}
280 280
281static inline void remove_free_dquot(struct dquot *dquot) 281static inline void remove_free_dquot(struct dquot *dquot)
@@ -283,7 +283,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
283 if (list_empty(&dquot->dq_free)) 283 if (list_empty(&dquot->dq_free))
284 return; 284 return;
285 list_del_init(&dquot->dq_free); 285 list_del_init(&dquot->dq_free);
286 dqstats.free_dquots--; 286 dqstats_dec(DQST_FREE_DQUOTS);
287} 287}
288 288
289static inline void put_inuse(struct dquot *dquot) 289static inline void put_inuse(struct dquot *dquot)
@@ -291,12 +291,12 @@ static inline void put_inuse(struct dquot *dquot)
291 /* We add to the back of inuse list so we don't have to restart 291 /* We add to the back of inuse list so we don't have to restart
292 * when traversing this list and we block */ 292 * when traversing this list and we block */
293 list_add_tail(&dquot->dq_inuse, &inuse_list); 293 list_add_tail(&dquot->dq_inuse, &inuse_list);
294 dqstats.allocated_dquots++; 294 dqstats_inc(DQST_ALLOC_DQUOTS);
295} 295}
296 296
297static inline void remove_inuse(struct dquot *dquot) 297static inline void remove_inuse(struct dquot *dquot)
298{ 298{
299 dqstats.allocated_dquots--; 299 dqstats_dec(DQST_ALLOC_DQUOTS);
300 list_del(&dquot->dq_inuse); 300 list_del(&dquot->dq_inuse);
301} 301}
302/* 302/*
@@ -319,14 +319,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
319 return dquot->dq_sb->dq_op->mark_dirty(dquot); 319 return dquot->dq_sb->dq_op->mark_dirty(dquot);
320} 320}
321 321
322/* Mark dquot dirty in an atomic manner, and return its old dirty flag state */
322int dquot_mark_dquot_dirty(struct dquot *dquot) 323int dquot_mark_dquot_dirty(struct dquot *dquot)
323{ 324{
325 int ret = 1;
326
327 /* If quota is dirty already, we don't have to acquire dq_list_lock */
328 if (test_bit(DQ_MOD_B, &dquot->dq_flags))
329 return 1;
330
324 spin_lock(&dq_list_lock); 331 spin_lock(&dq_list_lock);
325 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) 332 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
326 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 333 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
327 info[dquot->dq_type].dqi_dirty_list); 334 info[dquot->dq_type].dqi_dirty_list);
335 ret = 0;
336 }
328 spin_unlock(&dq_list_lock); 337 spin_unlock(&dq_list_lock);
329 return 0; 338 return ret;
330} 339}
331EXPORT_SYMBOL(dquot_mark_dquot_dirty); 340EXPORT_SYMBOL(dquot_mark_dquot_dirty);
332 341
@@ -552,8 +561,8 @@ int dquot_scan_active(struct super_block *sb,
552 continue; 561 continue;
553 /* Now we have active dquot so we can just increase use count */ 562 /* Now we have active dquot so we can just increase use count */
554 atomic_inc(&dquot->dq_count); 563 atomic_inc(&dquot->dq_count);
555 dqstats.lookups++;
556 spin_unlock(&dq_list_lock); 564 spin_unlock(&dq_list_lock);
565 dqstats_inc(DQST_LOOKUPS);
557 dqput(old_dquot); 566 dqput(old_dquot);
558 old_dquot = dquot; 567 old_dquot = dquot;
559 ret = fn(dquot, priv); 568 ret = fn(dquot, priv);
@@ -571,7 +580,7 @@ out:
571} 580}
572EXPORT_SYMBOL(dquot_scan_active); 581EXPORT_SYMBOL(dquot_scan_active);
573 582
574int vfs_quota_sync(struct super_block *sb, int type, int wait) 583int dquot_quota_sync(struct super_block *sb, int type, int wait)
575{ 584{
576 struct list_head *dirty; 585 struct list_head *dirty;
577 struct dquot *dquot; 586 struct dquot *dquot;
@@ -598,8 +607,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
598 * holding reference so we can safely just increase 607 * holding reference so we can safely just increase
599 * use count */ 608 * use count */
600 atomic_inc(&dquot->dq_count); 609 atomic_inc(&dquot->dq_count);
601 dqstats.lookups++;
602 spin_unlock(&dq_list_lock); 610 spin_unlock(&dq_list_lock);
611 dqstats_inc(DQST_LOOKUPS);
603 sb->dq_op->write_dquot(dquot); 612 sb->dq_op->write_dquot(dquot);
604 dqput(dquot); 613 dqput(dquot);
605 spin_lock(&dq_list_lock); 614 spin_lock(&dq_list_lock);
@@ -611,9 +620,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
611 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) 620 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
612 && info_dirty(&dqopt->info[cnt])) 621 && info_dirty(&dqopt->info[cnt]))
613 sb->dq_op->write_info(sb, cnt); 622 sb->dq_op->write_info(sb, cnt);
614 spin_lock(&dq_list_lock); 623 dqstats_inc(DQST_SYNCS);
615 dqstats.syncs++;
616 spin_unlock(&dq_list_lock);
617 mutex_unlock(&dqopt->dqonoff_mutex); 624 mutex_unlock(&dqopt->dqonoff_mutex);
618 625
619 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) 626 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -645,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
645 652
646 return 0; 653 return 0;
647} 654}
648EXPORT_SYMBOL(vfs_quota_sync); 655EXPORT_SYMBOL(dquot_quota_sync);
649 656
650/* Free unused dquots from cache */ 657/* Free unused dquots from cache */
651static void prune_dqcache(int count) 658static void prune_dqcache(int count)
@@ -669,7 +676,6 @@ static void prune_dqcache(int count)
669 * This is called from kswapd when we think we need some 676 * This is called from kswapd when we think we need some
670 * more memory 677 * more memory
671 */ 678 */
672
673static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) 679static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
674{ 680{
675 if (nr) { 681 if (nr) {
@@ -677,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
677 prune_dqcache(nr); 683 prune_dqcache(nr);
678 spin_unlock(&dq_list_lock); 684 spin_unlock(&dq_list_lock);
679 } 685 }
680 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 686 return ((unsigned)
687 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
688 /100) * sysctl_vfs_cache_pressure;
681} 689}
682 690
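
The scattered dqstats.xxx++ updates under dq_list_lock are replaced by dqstats_inc()/dqstats_dec(); as the shrinker hunk above shows, dqstats now wraps an array of percpu counters indexed by DQST_* constants. A minimal sketch of the accessors this diff relies on, assuming that layout:

/* sketch, assuming: struct dqstats { struct percpu_counter counter[...]; } */
static inline void dqstats_inc(unsigned int type)
{
	percpu_counter_inc(&dqstats.counter[type]);
}

static inline void dqstats_dec(unsigned int type)
{
	percpu_counter_dec(&dqstats.counter[type]);
}
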
683static struct shrinker dqcache_shrinker = { 691static struct shrinker dqcache_shrinker = {
@@ -695,7 +703,7 @@ void dqput(struct dquot *dquot)
695 703
696 if (!dquot) 704 if (!dquot)
697 return; 705 return;
698#ifdef __DQUOT_PARANOIA 706#ifdef CONFIG_QUOTA_DEBUG
699 if (!atomic_read(&dquot->dq_count)) { 707 if (!atomic_read(&dquot->dq_count)) {
700 printk("VFS: dqput: trying to free free dquot\n"); 708 printk("VFS: dqput: trying to free free dquot\n");
701 printk("VFS: device %s, dquot of %s %d\n", 709 printk("VFS: device %s, dquot of %s %d\n",
@@ -705,10 +713,7 @@ void dqput(struct dquot *dquot)
705 BUG(); 713 BUG();
706 } 714 }
707#endif 715#endif
708 716 dqstats_inc(DQST_DROPS);
709 spin_lock(&dq_list_lock);
710 dqstats.drops++;
711 spin_unlock(&dq_list_lock);
712we_slept: 717we_slept:
713 spin_lock(&dq_list_lock); 718 spin_lock(&dq_list_lock);
714 if (atomic_read(&dquot->dq_count) > 1) { 719 if (atomic_read(&dquot->dq_count) > 1) {
@@ -748,7 +753,7 @@ we_slept:
748 goto we_slept; 753 goto we_slept;
749 } 754 }
750 atomic_dec(&dquot->dq_count); 755 atomic_dec(&dquot->dq_count);
751#ifdef __DQUOT_PARANOIA 756#ifdef CONFIG_QUOTA_DEBUG
752 /* sanity check */ 757 /* sanity check */
753 BUG_ON(!list_empty(&dquot->dq_free)); 758 BUG_ON(!list_empty(&dquot->dq_free));
754#endif 759#endif
@@ -825,15 +830,15 @@ we_slept:
825 put_inuse(dquot); 830 put_inuse(dquot);
826 /* hash it first so it can be found */ 831 /* hash it first so it can be found */
827 insert_dquot_hash(dquot); 832 insert_dquot_hash(dquot);
828 dqstats.lookups++;
829 spin_unlock(&dq_list_lock); 833 spin_unlock(&dq_list_lock);
834 dqstats_inc(DQST_LOOKUPS);
830 } else { 835 } else {
831 if (!atomic_read(&dquot->dq_count)) 836 if (!atomic_read(&dquot->dq_count))
832 remove_free_dquot(dquot); 837 remove_free_dquot(dquot);
833 atomic_inc(&dquot->dq_count); 838 atomic_inc(&dquot->dq_count);
834 dqstats.cache_hits++;
835 dqstats.lookups++;
836 spin_unlock(&dq_list_lock); 839 spin_unlock(&dq_list_lock);
840 dqstats_inc(DQST_CACHE_HITS);
841 dqstats_inc(DQST_LOOKUPS);
837 } 842 }
838 /* Wait for dq_lock - after this we know that either dquot_release() is 843 /* Wait for dq_lock - after this we know that either dquot_release() is
839 * already finished or it will be canceled due to dq_count > 1 test */ 844 * already finished or it will be canceled due to dq_count > 1 test */
@@ -845,7 +850,7 @@ we_slept:
845 dquot = NULL; 850 dquot = NULL;
846 goto out; 851 goto out;
847 } 852 }
848#ifdef __DQUOT_PARANOIA 853#ifdef CONFIG_QUOTA_DEBUG
849 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 854 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
850#endif 855#endif
851out: 856out:
@@ -874,14 +879,18 @@ static int dqinit_needed(struct inode *inode, int type)
874static void add_dquot_ref(struct super_block *sb, int type) 879static void add_dquot_ref(struct super_block *sb, int type)
875{ 880{
876 struct inode *inode, *old_inode = NULL; 881 struct inode *inode, *old_inode = NULL;
882#ifdef CONFIG_QUOTA_DEBUG
877 int reserved = 0; 883 int reserved = 0;
884#endif
878 885
879 spin_lock(&inode_lock); 886 spin_lock(&inode_lock);
880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 887 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 888 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
882 continue; 889 continue;
890#ifdef CONFIG_QUOTA_DEBUG
883 if (unlikely(inode_get_rsv_space(inode) > 0)) 891 if (unlikely(inode_get_rsv_space(inode) > 0))
884 reserved = 1; 892 reserved = 1;
893#endif
885 if (!atomic_read(&inode->i_writecount)) 894 if (!atomic_read(&inode->i_writecount))
886 continue; 895 continue;
887 if (!dqinit_needed(inode, type)) 896 if (!dqinit_needed(inode, type))
@@ -903,11 +912,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
903 spin_unlock(&inode_lock); 912 spin_unlock(&inode_lock);
904 iput(old_inode); 913 iput(old_inode);
905 914
915#ifdef CONFIG_QUOTA_DEBUG
906 if (reserved) { 916 if (reserved) {
907 printk(KERN_WARNING "VFS (%s): Writes happened before quota" 917 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
908 " was turned on, thus quota information is probably " 918
909 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 919 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
910 } 920 }
921#endif
911} 922}
912 923
913/* 924/*
@@ -934,7 +945,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
934 inode->i_dquot[type] = NULL; 945 inode->i_dquot[type] = NULL;
935 if (dquot) { 946 if (dquot) {
936 if (dqput_blocks(dquot)) { 947 if (dqput_blocks(dquot)) {
937#ifdef __DQUOT_PARANOIA 948#ifdef CONFIG_QUOTA_DEBUG
938 if (atomic_read(&dquot->dq_count) != 1) 949 if (atomic_read(&dquot->dq_count) != 1)
939 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 950 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
940#endif 951#endif
@@ -1484,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1484/* 1495/*
1485 * This operation can block, but only after everything is updated 1496 * This operation can block, but only after everything is updated
1486 */ 1497 */
1487int __dquot_alloc_space(struct inode *inode, qsize_t number, 1498int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1488 int warn, int reserve)
1489{ 1499{
1490 int cnt, ret = 0; 1500 int cnt, ret = 0;
1491 char warntype[MAXQUOTAS]; 1501 char warntype[MAXQUOTAS];
1502 int warn = flags & DQUOT_SPACE_WARN;
1503 int reserve = flags & DQUOT_SPACE_RESERVE;
1504 int nofail = flags & DQUOT_SPACE_NOFAIL;
1492 1505
1493 /* 1506 /*
1494 * First test before acquiring mutex - solves deadlocks when we 1507 * First test before acquiring mutex - solves deadlocks when we
@@ -1509,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1509 continue; 1522 continue;
1510 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1523 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1511 warntype+cnt); 1524 warntype+cnt);
1512 if (ret) { 1525 if (ret && !nofail) {
1513 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1514 goto out_flush_warn; 1527 goto out_flush_warn;
1515 } 1528 }
@@ -1608,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1608/* 1621/*
1609 * This operation can block, but only after everything is updated 1622 * This operation can block, but only after everything is updated
1610 */ 1623 */
1611void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1624void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1612{ 1625{
1613 unsigned int cnt; 1626 unsigned int cnt;
1614 char warntype[MAXQUOTAS]; 1627 char warntype[MAXQUOTAS];
1628 int reserve = flags & DQUOT_SPACE_RESERVE;
1615 1629
1616 /* First test before acquiring mutex - solves deadlocks when we 1630 /* First test before acquiring mutex - solves deadlocks when we
1617 * re-enter the quota code and are already holding the mutex */ 1631 * re-enter the quota code and are already holding the mutex */
@@ -1673,16 +1687,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1673 1687
1674/* 1688/*
1675 * Transfer the number of inodes and blocks from one diskquota to another. 1689
1690 * On success, dquot references in transfer_to are consumed and references
1691 * to original dquots that need to be released are placed there. On failure,
1692 * references are kept untouched.
1676 * 1693 *
1677 * This operation can block, but only after everything is updated 1694 * This operation can block, but only after everything is updated
1678 * A transaction must be started when entering this function. 1695 * A transaction must be started when entering this function.
1696 *
1679 */ 1697 */
1680static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1698int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1681{ 1699{
1682 qsize_t space, cur_space; 1700 qsize_t space, cur_space;
1683 qsize_t rsv_space = 0; 1701 qsize_t rsv_space = 0;
1684 struct dquot *transfer_from[MAXQUOTAS]; 1702 struct dquot *transfer_from[MAXQUOTAS] = {};
1685 struct dquot *transfer_to[MAXQUOTAS];
1686 int cnt, ret = 0; 1703 int cnt, ret = 0;
1687 char warntype_to[MAXQUOTAS]; 1704 char warntype_to[MAXQUOTAS];
1688 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1705 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1692,19 +1709,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1692 if (IS_NOQUOTA(inode)) 1709 if (IS_NOQUOTA(inode))
1693 return 0; 1710 return 0;
1694 /* Initialize the arrays */ 1711 /* Initialize the arrays */
1695 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1712 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1696 transfer_from[cnt] = NULL;
1697 transfer_to[cnt] = NULL;
1698 warntype_to[cnt] = QUOTA_NL_NOWARN; 1713 warntype_to[cnt] = QUOTA_NL_NOWARN;
1699 }
1700 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1701 if (mask & (1 << cnt))
1702 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1703 }
1704 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1714 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1705 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1715 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1706 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1716 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1707 goto put_all; 1717 return 0;
1708 } 1718 }
1709 spin_lock(&dq_data_lock); 1719 spin_lock(&dq_data_lock);
1710 cur_space = inode_get_bytes(inode); 1720 cur_space = inode_get_bytes(inode);
@@ -1756,47 +1766,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1756 1766
1757 mark_all_dquot_dirty(transfer_from); 1767 mark_all_dquot_dirty(transfer_from);
1758 mark_all_dquot_dirty(transfer_to); 1768 mark_all_dquot_dirty(transfer_to);
1759 /* The reference we got is transferred to the inode */ 1769 /* Pass back references to put */
1760 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1770 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1761 transfer_to[cnt] = NULL; 1771 transfer_to[cnt] = transfer_from[cnt];
1762warn_put_all: 1772warn:
1763 flush_warnings(transfer_to, warntype_to); 1773 flush_warnings(transfer_to, warntype_to);
1764 flush_warnings(transfer_from, warntype_from_inodes); 1774 flush_warnings(transfer_from, warntype_from_inodes);
1765 flush_warnings(transfer_from, warntype_from_space); 1775 flush_warnings(transfer_from, warntype_from_space);
1766put_all:
1767 dqput_all(transfer_from);
1768 dqput_all(transfer_to);
1769 return ret; 1776 return ret;
1770over_quota: 1777over_quota:
1771 spin_unlock(&dq_data_lock); 1778 spin_unlock(&dq_data_lock);
1772 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1779 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1773 /* Clear dquot pointers we don't want to dqput() */ 1780 goto warn;
1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1775 transfer_from[cnt] = NULL;
1776 goto warn_put_all;
1777} 1781}
1782EXPORT_SYMBOL(__dquot_transfer);
1778 1783
1779/* Wrapper for transferring ownership of an inode for uid/gid only 1784/* Wrapper for transferring ownership of an inode for uid/gid only
1780 * Called from FSXXX_setattr() 1785 * Called from FSXXX_setattr()
1781 */ 1786 */
1782int dquot_transfer(struct inode *inode, struct iattr *iattr) 1787int dquot_transfer(struct inode *inode, struct iattr *iattr)
1783{ 1788{
1784 qid_t chid[MAXQUOTAS]; 1789 struct dquot *transfer_to[MAXQUOTAS] = {};
1785 unsigned long mask = 0; 1790 struct super_block *sb = inode->i_sb;
1791 int ret;
1786 1792
1787 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1793 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
1788 mask |= 1 << USRQUOTA; 1794 return 0;
1789 chid[USRQUOTA] = iattr->ia_uid; 1795
1790 } 1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1791 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1797 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1792 mask |= 1 << GRPQUOTA; 1798 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1793 chid[GRPQUOTA] = iattr->ia_gid; 1799 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1794 } 1800
1795 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1801 ret = __dquot_transfer(inode, transfer_to);
1796 dquot_initialize(inode); 1802 dqput_all(transfer_to);
1797 return __dquot_transfer(inode, chid, mask); 1803 return ret;
1798 }
1799 return 0;
1800} 1804}
1801EXPORT_SYMBOL(dquot_transfer); 1805EXPORT_SYMBOL(dquot_transfer);
1802 1806
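For context, a minimal sketch of how a filesystem's setattr path might use the reworked interface; example_setattr() and its error handling are hypothetical, only dquot_transfer() comes from this patch:

	#include <linux/quotaops.h>

	static int example_setattr(struct dentry *dentry, struct iattr *iattr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, iattr);
		if (error)
			return error;

		/*
		 * dquot_transfer() now checks sb_any_quota_active() and
		 * IS_NOQUOTA() itself, so it can be called unconditionally
		 * for uid/gid changes.
		 */
		error = dquot_transfer(inode, iattr);
		if (error)
			return error;

		return inode_setattr(inode, iattr);
	}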
@@ -1827,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
1827 .alloc_dquot = dquot_alloc, 1831 .alloc_dquot = dquot_alloc,
1828 .destroy_dquot = dquot_destroy, 1832 .destroy_dquot = dquot_destroy,
1829}; 1833};
1834EXPORT_SYMBOL(dquot_operations);
1830 1835
1831/* 1836/*
1832 * Generic helper for ->open on filesystems supporting disk quotas. 1837 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1845,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
1845/* 1850/*
1846 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1851 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1847 */ 1852 */
1848int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1853int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1849{ 1854{
1850 int cnt, ret = 0; 1855 int cnt, ret = 0;
1851 struct quota_info *dqopt = sb_dqopt(sb); 1856 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1975,14 +1980,15 @@ put_inodes:
1975 } 1980 }
1976 return ret; 1981 return ret;
1977} 1982}
1978EXPORT_SYMBOL(vfs_quota_disable); 1983EXPORT_SYMBOL(dquot_disable);
1979 1984
1980int vfs_quota_off(struct super_block *sb, int type, int remount) 1985int dquot_quota_off(struct super_block *sb, int type)
1981{ 1986{
1982 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 1987 return dquot_disable(sb, type,
1983 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 1988 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1984} 1989}
1985EXPORT_SYMBOL(vfs_quota_off); 1990EXPORT_SYMBOL(dquot_quota_off);
1991
1986/* 1992/*
1987 * Turn on quotas on a device 1993
1988 */ 1994 */
@@ -2100,36 +2106,43 @@ out_fmt:
2100} 2106}
2101 2107
2102/* Reenable quotas on remount RW */ 2108/* Reenable quotas on remount RW */
2103static int vfs_quota_on_remount(struct super_block *sb, int type) 2109int dquot_resume(struct super_block *sb, int type)
2104{ 2110{
2105 struct quota_info *dqopt = sb_dqopt(sb); 2111 struct quota_info *dqopt = sb_dqopt(sb);
2106 struct inode *inode; 2112 struct inode *inode;
2107 int ret; 2113 int ret = 0, cnt;
2108 unsigned int flags; 2114 unsigned int flags;
2109 2115
2110 mutex_lock(&dqopt->dqonoff_mutex); 2116 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2111 if (!sb_has_quota_suspended(sb, type)) { 2117 if (type != -1 && cnt != type)
2118 continue;
2119
2120 mutex_lock(&dqopt->dqonoff_mutex);
2121 if (!sb_has_quota_suspended(sb, cnt)) {
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 continue;
2124 }
2125 inode = dqopt->files[cnt];
2126 dqopt->files[cnt] = NULL;
2127 spin_lock(&dq_state_lock);
2128 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2129 DQUOT_LIMITS_ENABLED,
2130 cnt);
2131 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2132 spin_unlock(&dq_state_lock);
2112 mutex_unlock(&dqopt->dqonoff_mutex); 2133 mutex_unlock(&dqopt->dqonoff_mutex);
2113 return 0;
2114 }
2115 inode = dqopt->files[type];
2116 dqopt->files[type] = NULL;
2117 spin_lock(&dq_state_lock);
2118 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2119 DQUOT_LIMITS_ENABLED, type);
2120 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2121 spin_unlock(&dq_state_lock);
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 2134
2124 flags = dquot_generic_flag(flags, type); 2135 flags = dquot_generic_flag(flags, cnt);
2125 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2136 ret = vfs_load_quota_inode(inode, cnt,
2126 flags); 2137 dqopt->info[cnt].dqi_fmt_id, flags);
2127 iput(inode); 2138 iput(inode);
2139 }
2128 2140
2129 return ret; 2141 return ret;
2130} 2142}
2143EXPORT_SYMBOL(dquot_resume);
2131 2144
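A sketch of the intended remount pairing, assuming quotas were suspended via dquot_disable(sb, -1, DQUOT_SUSPENDED) on the way to read-only; example_remount() is hypothetical:

	static int example_remount(struct super_block *sb, int *flags, char *data)
	{
		if (*flags & MS_RDONLY)
			/* going read-only: suspend all quota types */
			return dquot_disable(sb, -1, DQUOT_SUSPENDED);
		if (sb->s_flags & MS_RDONLY)
			/* back to read-write: reload what was suspended */
			return dquot_resume(sb, -1);
		return 0;
	}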
2132int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2145int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2133 struct path *path) 2146 struct path *path)
2134{ 2147{
2135 int error = security_quota_on(path->dentry); 2148 int error = security_quota_on(path->dentry);
@@ -2144,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2144 DQUOT_LIMITS_ENABLED); 2157 DQUOT_LIMITS_ENABLED);
2145 return error; 2158 return error;
2146} 2159}
2147EXPORT_SYMBOL(vfs_quota_on_path); 2160EXPORT_SYMBOL(dquot_quota_on_path);
2148 2161
2149int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2162int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2150 int remount)
2151{ 2163{
2152 struct path path; 2164 struct path path;
2153 int error; 2165 int error;
2154 2166
2155 if (remount)
2156 return vfs_quota_on_remount(sb, type);
2157
2158 error = kern_path(name, LOOKUP_FOLLOW, &path); 2167 error = kern_path(name, LOOKUP_FOLLOW, &path);
2159 if (!error) { 2168 if (!error) {
2160 error = vfs_quota_on_path(sb, type, format_id, &path); 2169 error = dquot_quota_on_path(sb, type, format_id, &path);
2161 path_put(&path); 2170 path_put(&path);
2162 } 2171 }
2163 return error; 2172 return error;
2164} 2173}
2165EXPORT_SYMBOL(vfs_quota_on); 2174EXPORT_SYMBOL(dquot_quota_on);
2166 2175
2167/* 2176/*
2168 * More powerful function for turning on quotas allowing setting 2177 * More powerful function for turning on quotas allowing setting
2169 * of individual quota flags 2178 * of individual quota flags
2170 */ 2179 */
2171int vfs_quota_enable(struct inode *inode, int type, int format_id, 2180int dquot_enable(struct inode *inode, int type, int format_id,
2172 unsigned int flags) 2181 unsigned int flags)
2173{ 2182{
2174 int ret = 0; 2183 int ret = 0;
2175 struct super_block *sb = inode->i_sb; 2184 struct super_block *sb = inode->i_sb;
2176 struct quota_info *dqopt = sb_dqopt(sb); 2185 struct quota_info *dqopt = sb_dqopt(sb);
2177 2186
2178 /* Just unsuspend quotas? */ 2187 /* Just unsuspend quotas? */
2179 if (flags & DQUOT_SUSPENDED) 2188 BUG_ON(flags & DQUOT_SUSPENDED);
2180 return vfs_quota_on_remount(sb, type); 2189
2181 if (!flags) 2190 if (!flags)
2182 return 0; 2191 return 0;
2183 /* Just updating flags needed? */ 2192 /* Just updating flags needed? */
@@ -2209,13 +2218,13 @@ out_lock:
2209load_quota: 2218load_quota:
2210 return vfs_load_quota_inode(inode, type, format_id, flags); 2219 return vfs_load_quota_inode(inode, type, format_id, flags);
2211} 2220}
2212EXPORT_SYMBOL(vfs_quota_enable); 2221EXPORT_SYMBOL(dquot_enable);
2213 2222
2214/* 2223/*
2215 * This function is used when filesystem needs to initialize quotas 2224 * This function is used when filesystem needs to initialize quotas
2216 * during mount time. 2225 * during mount time.
2217 */ 2226 */
2218int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2227int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2219 int format_id, int type) 2228 int format_id, int type)
2220{ 2229{
2221 struct dentry *dentry; 2230 struct dentry *dentry;
@@ -2241,24 +2250,7 @@ out:
2241 dput(dentry); 2250 dput(dentry);
2242 return error; 2251 return error;
2243} 2252}
2244EXPORT_SYMBOL(vfs_quota_on_mount); 2253EXPORT_SYMBOL(dquot_quota_on_mount);
2245
2246/* Wrapper to turn on quotas when remounting rw */
2247int vfs_dq_quota_on_remount(struct super_block *sb)
2248{
2249 int cnt;
2250 int ret = 0, err;
2251
2252 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2253 return -ENOSYS;
2254 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2255 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2256 if (err < 0 && !ret)
2257 ret = err;
2258 }
2259 return ret;
2260}
2261EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2262 2254
2263static inline qsize_t qbtos(qsize_t blocks) 2255static inline qsize_t qbtos(qsize_t blocks)
2264{ 2256{
@@ -2271,25 +2263,30 @@ static inline qsize_t stoqb(qsize_t space)
2271} 2263}
2272 2264
2273/* Generic routine for getting common part of quota structure */ 2265/* Generic routine for getting common part of quota structure */
2274static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2266static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2275{ 2267{
2276 struct mem_dqblk *dm = &dquot->dq_dqb; 2268 struct mem_dqblk *dm = &dquot->dq_dqb;
2277 2269
2270 memset(di, 0, sizeof(*di));
2271 di->d_version = FS_DQUOT_VERSION;
2272 di->d_flags = dquot->dq_type == USRQUOTA ?
2273 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
2274 di->d_id = dquot->dq_id;
2275
2278 spin_lock(&dq_data_lock); 2276 spin_lock(&dq_data_lock);
2279 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2277 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2280 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2278 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2281 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2279 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2282 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2280 di->d_ino_softlimit = dm->dqb_isoftlimit;
2283 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2281 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2284 di->dqb_curinodes = dm->dqb_curinodes; 2282 di->d_icount = dm->dqb_curinodes;
2285 di->dqb_btime = dm->dqb_btime; 2283 di->d_btimer = dm->dqb_btime;
2286 di->dqb_itime = dm->dqb_itime; 2284 di->d_itimer = dm->dqb_itime;
2287 di->dqb_valid = QIF_ALL;
2288 spin_unlock(&dq_data_lock); 2285 spin_unlock(&dq_data_lock);
2289} 2286}
2290 2287
2291int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2288int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2292 struct if_dqblk *di) 2289 struct fs_disk_quota *di)
2293{ 2290{
2294 struct dquot *dquot; 2291 struct dquot *dquot;
2295 2292
@@ -2301,55 +2298,74 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2301 2298
2302 return 0; 2299 return 0;
2303} 2300}
2304EXPORT_SYMBOL(vfs_get_dqblk); 2301EXPORT_SYMBOL(dquot_get_dqblk);
2302
2303#define VFS_FS_DQ_MASK \
2304 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2305 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2306 FS_DQ_BTIMER | FS_DQ_ITIMER)
2305 2307
2306/* Generic routine for setting common part of quota structure */ 2308/* Generic routine for setting common part of quota structure */
2307static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2309static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2308{ 2310{
2309 struct mem_dqblk *dm = &dquot->dq_dqb; 2311 struct mem_dqblk *dm = &dquot->dq_dqb;
2310 int check_blim = 0, check_ilim = 0; 2312 int check_blim = 0, check_ilim = 0;
2311 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2313 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2312 2314
2313 if ((di->dqb_valid & QIF_BLIMITS && 2315 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2314 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2316 return -EINVAL;
2315 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2317
2316 (di->dqb_valid & QIF_ILIMITS && 2318 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2317 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2319 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2318 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2320 ((di->d_fieldmask & FS_DQ_BHARD) &&
2321 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2322 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2323 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2324 ((di->d_fieldmask & FS_DQ_IHARD) &&
2325 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2319 return -ERANGE; 2326 return -ERANGE;
2320 2327
2321 spin_lock(&dq_data_lock); 2328 spin_lock(&dq_data_lock);
2322 if (di->dqb_valid & QIF_SPACE) { 2329 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2323 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2330 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2324 check_blim = 1; 2331 check_blim = 1;
2325 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2332 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2326 } 2333 }
2327 if (di->dqb_valid & QIF_BLIMITS) { 2334
2328 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2335 if (di->d_fieldmask & FS_DQ_BSOFT)
2329 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2336 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2337 if (di->d_fieldmask & FS_DQ_BHARD)
2338 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2339 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2330 check_blim = 1; 2340 check_blim = 1;
2331 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2341 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2332 } 2342 }
2333 if (di->dqb_valid & QIF_INODES) { 2343
2334 dm->dqb_curinodes = di->dqb_curinodes; 2344 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2345 dm->dqb_curinodes = di->d_icount;
2335 check_ilim = 1; 2346 check_ilim = 1;
2336 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2347 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2337 } 2348 }
2338 if (di->dqb_valid & QIF_ILIMITS) { 2349
2339 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2350 if (di->d_fieldmask & FS_DQ_ISOFT)
2340 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2351 dm->dqb_isoftlimit = di->d_ino_softlimit;
2352 if (di->d_fieldmask & FS_DQ_IHARD)
2353 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2354 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2341 check_ilim = 1; 2355 check_ilim = 1;
2342 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2356 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2343 } 2357 }
2344 if (di->dqb_valid & QIF_BTIME) { 2358
2345 dm->dqb_btime = di->dqb_btime; 2359 if (di->d_fieldmask & FS_DQ_BTIMER) {
2360 dm->dqb_btime = di->d_btimer;
2346 check_blim = 1; 2361 check_blim = 1;
2347 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2362 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2348 } 2363 }
2349 if (di->dqb_valid & QIF_ITIME) { 2364
2350 dm->dqb_itime = di->dqb_itime; 2365 if (di->d_fieldmask & FS_DQ_ITIMER) {
2366 dm->dqb_itime = di->d_itimer;
2351 check_ilim = 1; 2367 check_ilim = 1;
2352 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2368 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2353 } 2369 }
2354 2370
2355 if (check_blim) { 2371 if (check_blim) {
@@ -2357,7 +2373,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2357 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2373 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2358 dm->dqb_btime = 0; 2374 dm->dqb_btime = 0;
2359 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2375 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2360 } else if (!(di->dqb_valid & QIF_BTIME)) 2376 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2361 /* Set grace only if user hasn't provided his own... */ 2377 /* Set grace only if user hasn't provided his own... */
2362 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2378 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2363 } 2379 }
@@ -2366,7 +2382,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2366 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2382 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2367 dm->dqb_itime = 0; 2383 dm->dqb_itime = 0;
2368 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2384 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2369 } else if (!(di->dqb_valid & QIF_ITIME)) 2385 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2370 /* Set grace only if user hasn't provided his own... */ 2386 /* Set grace only if user hasn't provided his own... */
2371 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2387 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2372 } 2388 }
@@ -2381,8 +2397,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2381 return 0; 2397 return 0;
2382} 2398}
2383 2399
2384int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2400int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2385 struct if_dqblk *di) 2401 struct fs_disk_quota *di)
2386{ 2402{
2387 struct dquot *dquot; 2403 struct dquot *dquot;
2388 int rc; 2404 int rc;
@@ -2397,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2397out: 2413out:
2398 return rc; 2414 return rc;
2399} 2415}
2400EXPORT_SYMBOL(vfs_set_dqblk); 2416EXPORT_SYMBOL(dquot_set_dqblk);
2401 2417
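For illustration, setting only the block limits for one user through the new XFS-style structure; the values and example_uid are hypothetical, and d_fieldmask selects exactly which fields do_set_dqblk() applies:

	struct fs_disk_quota fdq = {
		.d_fieldmask	 = FS_DQ_BSOFT | FS_DQ_BHARD,
		.d_blk_softlimit = 9000,	/* qbtos() converts to bytes */
		.d_blk_hardlimit = 10000,
	};
	int err = dquot_set_dqblk(sb, USRQUOTA, example_uid, &fdq);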
2402/* Generic routine for getting common part of quota file information */ 2418/* Generic routine for getting common part of quota file information */
2403int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2419int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2404{ 2420{
2405 struct mem_dqinfo *mi; 2421 struct mem_dqinfo *mi;
2406 2422
@@ -2419,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2419 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2435 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2420 return 0; 2436 return 0;
2421} 2437}
2422EXPORT_SYMBOL(vfs_get_dqinfo); 2438EXPORT_SYMBOL(dquot_get_dqinfo);
2423 2439
2424/* Generic routine for setting common part of quota file information */ 2440/* Generic routine for setting common part of quota file information */
2425int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2441int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2426{ 2442{
2427 struct mem_dqinfo *mi; 2443 struct mem_dqinfo *mi;
2428 int err = 0; 2444 int err = 0;
@@ -2449,74 +2465,86 @@ out:
2449 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2465 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2450 return err; 2466 return err;
2451} 2467}
2452EXPORT_SYMBOL(vfs_set_dqinfo); 2468EXPORT_SYMBOL(dquot_set_dqinfo);
2453 2469
2454const struct quotactl_ops vfs_quotactl_ops = { 2470const struct quotactl_ops dquot_quotactl_ops = {
2455 .quota_on = vfs_quota_on, 2471 .quota_on = dquot_quota_on,
2456 .quota_off = vfs_quota_off, 2472 .quota_off = dquot_quota_off,
2457 .quota_sync = vfs_quota_sync, 2473 .quota_sync = dquot_quota_sync,
2458 .get_info = vfs_get_dqinfo, 2474 .get_info = dquot_get_dqinfo,
2459 .set_info = vfs_set_dqinfo, 2475 .set_info = dquot_set_dqinfo,
2460 .get_dqblk = vfs_get_dqblk, 2476 .get_dqblk = dquot_get_dqblk,
2461 .set_dqblk = vfs_set_dqblk 2477 .set_dqblk = dquot_set_dqblk
2462}; 2478};
2479EXPORT_SYMBOL(dquot_quotactl_ops);
2480
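Both operation tables are now exported so that filesystems using the generic quota code can hook them up directly; a two-line sketch of the expected wiring inside a filesystem's fill_super (the surrounding function is not shown here):

	sb->dq_op = &dquot_operations;		/* exported earlier in this patch */
	sb->s_qcop = &dquot_quotactl_ops;	/* generic quotactl(2) handlers */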
2481static int do_proc_dqstats(struct ctl_table *table, int write,
2482 void __user *buffer, size_t *lenp, loff_t *ppos)
2483{
2484 unsigned int type = (int *)table->data - dqstats.stat;
2485
2486 /* Update global table */
2487 dqstats.stat[type] =
2488 percpu_counter_sum_positive(&dqstats.counter[type]);
2489 return proc_dointvec(table, write, buffer, lenp, ppos);
2490}
2463 2491
2464static ctl_table fs_dqstats_table[] = { 2492static ctl_table fs_dqstats_table[] = {
2465 { 2493 {
2466 .procname = "lookups", 2494 .procname = "lookups",
2467 .data = &dqstats.lookups, 2495 .data = &dqstats.stat[DQST_LOOKUPS],
2468 .maxlen = sizeof(int), 2496 .maxlen = sizeof(int),
2469 .mode = 0444, 2497 .mode = 0444,
2470 .proc_handler = proc_dointvec, 2498 .proc_handler = do_proc_dqstats,
2471 }, 2499 },
2472 { 2500 {
2473 .procname = "drops", 2501 .procname = "drops",
2474 .data = &dqstats.drops, 2502 .data = &dqstats.stat[DQST_DROPS],
2475 .maxlen = sizeof(int), 2503 .maxlen = sizeof(int),
2476 .mode = 0444, 2504 .mode = 0444,
2477 .proc_handler = proc_dointvec, 2505 .proc_handler = do_proc_dqstats,
2478 }, 2506 },
2479 { 2507 {
2480 .procname = "reads", 2508 .procname = "reads",
2481 .data = &dqstats.reads, 2509 .data = &dqstats.stat[DQST_READS],
2482 .maxlen = sizeof(int), 2510 .maxlen = sizeof(int),
2483 .mode = 0444, 2511 .mode = 0444,
2484 .proc_handler = proc_dointvec, 2512 .proc_handler = do_proc_dqstats,
2485 }, 2513 },
2486 { 2514 {
2487 .procname = "writes", 2515 .procname = "writes",
2488 .data = &dqstats.writes, 2516 .data = &dqstats.stat[DQST_WRITES],
2489 .maxlen = sizeof(int), 2517 .maxlen = sizeof(int),
2490 .mode = 0444, 2518 .mode = 0444,
2491 .proc_handler = proc_dointvec, 2519 .proc_handler = do_proc_dqstats,
2492 }, 2520 },
2493 { 2521 {
2494 .procname = "cache_hits", 2522 .procname = "cache_hits",
2495 .data = &dqstats.cache_hits, 2523 .data = &dqstats.stat[DQST_CACHE_HITS],
2496 .maxlen = sizeof(int), 2524 .maxlen = sizeof(int),
2497 .mode = 0444, 2525 .mode = 0444,
2498 .proc_handler = proc_dointvec, 2526 .proc_handler = do_proc_dqstats,
2499 }, 2527 },
2500 { 2528 {
2501 .procname = "allocated_dquots", 2529 .procname = "allocated_dquots",
2502 .data = &dqstats.allocated_dquots, 2530 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2503 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2504 .mode = 0444, 2532 .mode = 0444,
2505 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2506 }, 2534 },
2507 { 2535 {
2508 .procname = "free_dquots", 2536 .procname = "free_dquots",
2509 .data = &dqstats.free_dquots, 2537 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2510 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2511 .mode = 0444, 2539 .mode = 0444,
2512 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2513 }, 2541 },
2514 { 2542 {
2515 .procname = "syncs", 2543 .procname = "syncs",
2516 .data = &dqstats.syncs, 2544 .data = &dqstats.stat[DQST_SYNCS],
2517 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2518 .mode = 0444, 2546 .mode = 0444,
2519 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2520 }, 2548 },
2521#ifdef CONFIG_PRINT_QUOTA_WARNING 2549#ifdef CONFIG_PRINT_QUOTA_WARNING
2522 { 2550 {
@@ -2550,7 +2578,7 @@ static ctl_table sys_table[] = {
2550 2578
2551static int __init dquot_init(void) 2579static int __init dquot_init(void)
2552{ 2580{
2553 int i; 2581 int i, ret;
2554 unsigned long nr_hash, order; 2582 unsigned long nr_hash, order;
2555 2583
2556 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2584 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2568,6 +2596,12 @@ static int __init dquot_init(void)
2568 if (!dquot_hash) 2596 if (!dquot_hash)
2569 panic("Cannot create dquot hash table"); 2597 panic("Cannot create dquot hash table");
2570 2598
2599 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2600 ret = percpu_counter_init(&dqstats.counter[i], 0);
2601 if (ret)
2602 panic("Cannot create dquot stat counters");
2603 }
2604
2571 /* Find power-of-two hlist_heads which can fit into allocation */ 2605 /* Find power-of-two hlist_heads which can fit into allocation */
2572 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2606 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2573 dq_hash_bits = 0; 2607 dq_hash_bits = 0;
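The dqstats_inc() calls used by the format drivers further down presumably reduce to a per-CPU increment over the counters initialized above, along these lines (a sketch, not the actual header definition):

	static inline void dqstats_inc(unsigned int type)
	{
		percpu_counter_inc(&dqstats.counter[type]);
	}

do_proc_dqstats() then folds the per-CPU values back into dqstats.stat[] only when /proc is read, keeping the hot paths off a shared cacheline.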
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/quotaops.h> 6#include <linux/quotaops.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/slab.h>
8#include <net/netlink.h> 9#include <net/netlink.h>
9#include <net/genetlink.h> 10#include <net/genetlink.h>
10 11
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
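The open-coded superblock walk is replaced by iterate_supers(), which takes a callback of the shape shown in quota_sync_one() above, holds sb->s_umount across each call and skips superblocks without a root. A small sketch of the pattern with a hypothetical callback:

	static void example_count_rw(struct super_block *sb, void *arg)
	{
		/* called with sb->s_umount held and sb->s_root valid */
		if (!(sb->s_flags & MS_RDONLY))
			(*(int *)arg)++;
	}

	int rw_supers = 0;
	iterate_supers(example_count_rw, &rw_supers);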
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -87,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
87 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
88 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
91 putname(pathname); 77 putname(pathname);
92 return ret; 78 return ret;
93} 79}
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
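A worked example of the translation done by the two copy helpers above: a Q_SETQUOTA caller passing dqb_valid = QIF_BLIMITS | QIF_ITIME ends up with

	fdq.d_fieldmask = FS_DQ_BSOFT | FS_DQ_BHARD | FS_DQ_ITIMER;

so dquot_set_dqblk() (or an XFS-style ->set_dqblk) updates exactly the soft/hard block limits and the inode grace time, leaving every other field untouched.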
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
@@ -239,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
239 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
240 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
241 return -ENOSYS; 262 return -ENOSYS;
242 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
243 case Q_GETFMT: 264 case Q_GETFMT:
244 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
245 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on "
69 "dev %s\n", sb->s_id);
70 if (ret >= 0)
71 ret = -EIO;
72 }
73 return ret;
66} 74}
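Callers consequently only need the sign of the return value; short writes are already mapped to -EIO inside write_blk(), e.g.:

	ret = write_blk(info, blk, buf);
	if (ret < 0)	/* covers both I/O errors and short writes */
		goto out_buf;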
67 75
68/* Remove empty block from list and return it */ 76/* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 161 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 162 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 163 q_warn(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n", 164 "VFS: Can't write block (%u) with free entries.\n",
157 blk); 165 blk);
158 return 0; 166 return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 253 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 254 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
248 "remove block (%u) from entry free list.\n", 256 "remove block (%u) from entry free list.\n",
249 blk); 257 blk);
250 goto out_buf; 258 goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268#endif 276#endif
269 *err = write_blk(info, blk, buf); 277 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 278 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
272 "data block %u.\n", blk); 280 "data block %u.\n", blk);
273 goto out_buf; 281 goto out_buf;
274 } 282 }
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 311 } else {
304 ret = read_blk(info, *treeblk, buf); 312 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 313 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 314 q_warn(KERN_ERR "VFS: Can't read tree quota block "
307 "%u.\n", *treeblk); 315 "%u.\n", *treeblk);
308 goto out_buf; 316 goto out_buf;
309 } 317 }
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 373 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 374 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 375 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 376 q_warn(KERN_ERR "VFS: Error %zd occurred while "
369 "creating quota.\n", ret); 377 "creating quota.\n", ret);
370 kfree(ddquot); 378 kfree(ddquot);
371 return ret; 379 return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 386 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 387 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
381 sb->s_id); 389 sb->s_id);
382 if (ret >= 0) 390 if (ret >= 0)
383 ret = -ENOSPC; 391 ret = -ENOSPC;
384 } else { 392 } else {
385 ret = 0; 393 ret = 0;
386 } 394 }
387 dqstats.writes++; 395 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 396 kfree(ddquot);
389 397
390 return ret; 398 return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 410 if (!buf)
403 return -ENOMEM; 411 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 413 q_warn(KERN_ERR "VFS: Quota structure has offset to other "
406 "block (%u) than it should (%u).\n", blk, 414 "block (%u) than it should (%u).\n", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 416 goto out_buf;
409 } 417 }
410 ret = read_blk(info, blk, buf); 418 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 419 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
413 goto out_buf; 421 goto out_buf;
414 } 422 }
415 dh = (struct qt_disk_dqdbheader *)buf; 423 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 427 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 428 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 429 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
423 "to free list.\n", blk); 431 "to free list.\n", blk);
424 goto out_buf; 432 goto out_buf;
425 } 433 }
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 440 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 441 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 442 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 443 q_warn(KERN_ERR "VFS: Can't insert quota data "
436 "block (%u) to free entry list.\n", blk); 444 "block (%u) to free entry list.\n", blk);
437 goto out_buf; 445 goto out_buf;
438 } 446 }
439 } else { 447 } else {
440 ret = write_blk(info, blk, buf); 448 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 449 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 450 q_warn(KERN_ERR "VFS: Can't write quota data "
443 "block %u\n", blk); 451 "block %u\n", blk);
444 goto out_buf; 452 goto out_buf;
445 } 453 }
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 472 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 473 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 474 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
468 goto out_buf; 476 goto out_buf;
469 } 477 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 496 } else {
489 ret = write_blk(info, *blk, buf); 497 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 498 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 499 q_warn(KERN_ERR "VFS: Can't write quota tree "
492 "block %u.\n", *blk); 500 "block %u.\n", *blk);
493 } 501 }
494 } 502 }
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 529 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 530 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 531 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
525 goto out_buf; 533 goto out_buf;
526 } 534 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 535 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
532 } 540 }
533 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 542 q_warn(KERN_ERR "VFS: Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 543 "but not present.\n", dquot->dq_id);
536 ret = -EIO; 544 ret = -EIO;
537 goto out_buf; 545 goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 564 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 565 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 566 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
560 goto out_buf; 568 goto out_buf;
561 } 569 }
562 ret = 0; 570 ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 607 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 608 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 609 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 610 q_warn(KERN_ERR "VFS: Can't read quota "
603 "structure for id %u.\n", dquot->dq_id); 611 "structure for id %u.\n", dquot->dq_id);
604 dquot->dq_off = 0; 612 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 613 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 625 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 626 if (ret >= 0)
619 ret = -EIO; 627 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 628 q_warn(KERN_ERR "VFS: Error while reading quota "
621 "structure for id %u.\n", dquot->dq_id); 629 "structure for id %u.\n", dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 630 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 642 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 643 kfree(ddquot);
636out: 644out:
637 dqstats.reads++; 645 dqstats_inc(DQST_READS);
638 return ret; 646 return ret;
639} 647}
640EXPORT_SYMBOL(qtree_read_dquot); 648EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while (0)
30
25#endif /* _LINUX_QUOTAIO_TREE_H */ 31#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 104 ret = 0;
105 105
106out: 106out:
107 dqstats.writes++; 107 dqstats_inc(DQST_WRITES);
108 108
109 return ret; 109 return ret;
110} 110}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 q_warn(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n", 67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size); 68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 69 return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
110 sb->s_id); 110 sb->s_id);
111 return -1; 111 return -1;
112 } 112 }
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 169 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
171 sb->s_id); 171 sb->s_id);
172 return -1; 172 return -1;
173 } 173 }
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include "internal.h" 27#include "internal.h"
@@ -41,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
41 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
42 .write = do_sync_write, 43 .write = do_sync_write,
43 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
44 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
47 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -145,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
145 return ret; 146 return ret;
146 } 147 }
147 148
148 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
149 150
150 return ret; 151 return ret;
151} 152}
@@ -168,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
168 169
169 /* pick out size-changing events */ 170 /* pick out size-changing events */
170 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
171 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
172 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
173 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
174 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -181,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
181 } 183 }
182 } 184 }
183 185
184 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
185 out: 187 out:
186 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
187 return ret; 189 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h> 37#include <linux/magic.h>
38#include <linux/slab.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include "internal.h" 40#include "internal.h"
40 41
@@ -51,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
51 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
52}; 53};
53 54
54struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
55{ 57{
56 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
57 59
58 if (inode) { 60 if (inode) {
59 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
60 inode->i_uid = current_fsuid();
61 inode->i_gid = current_fsgid();
62 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -94,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
94static int 94static int
95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
96{ 96{
97 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
98 int error = -ENOSPC; 98 int error = -ENOSPC;
99 99
100 if (inode) { 100 if (inode) {
101 if (dir->i_mode & S_ISGID) {
102 inode->i_gid = dir->i_gid;
103 if (S_ISDIR(mode))
104 inode->i_mode |= S_ISGID;
105 }
106 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
107 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
108 error = 0; 103 error = 0;
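The removed S_ISGID handling moves into inode_init_owner(), which presumably behaves roughly like this sketch (uid from the caller, gid and setgid bit inherited from a setgid parent directory):

	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;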
@@ -129,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
129 struct inode *inode; 124 struct inode *inode;
130 int error = -ENOSPC; 125 int error = -ENOSPC;
131 126
132 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
133 if (inode) { 128 if (inode) {
134 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
135 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
136 if (!error) { 131 if (!error) {
137 if (dir->i_mode & S_ISGID)
138 inode->i_gid = dir->i_gid;
139 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
140 dget(dentry); 133 dget(dentry);
141 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -213,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
213 return 0; 206 return 0;
214} 207}
215 208
216static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
217{ 210{
218 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
219 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -240,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
240 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
241 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
242 235
243 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
244 if (!inode) { 237 if (!inode) {
245 err = -ENOMEM; 238 err = -ENOMEM;
246 goto fail; 239 goto fail;
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek usable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
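A sketch of where noop_llseek() is meant to be plugged in; example_dev_fops and its read/write handlers are hypothetical:

	static const struct file_operations example_dev_fops = {
		.owner	= THIS_MODULE,
		.read	= example_read,
		.write	= example_write,
		.llseek	= noop_llseek,	/* reports success, never moves f_pos */
	};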
@@ -258,6 +275,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 275 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 276 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 277 kiocb.ki_left = len;
278 kiocb.ki_nbytes = len;
261 279
262 for (;;) { 280 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 281 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +331,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 331 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 332 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 333 kiocb.ki_left = len;
334 kiocb.ki_nbytes = len;
316 335
317 for (;;) { 336 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 337 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,15 +8,16 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
@@ -26,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
@@ -45,8 +45,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 	       deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
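
The two fsync hunks above follow the tree-wide change that drops the dentry argument from ->fsync(). A hedged sketch of the resulting method shape; example_fsync is a placeholder name, not from this patch:

	static int example_fsync(struct file *filp, int datasync)
	{
		/* the inode is now reached via the file's mapping */
		struct inode *inode = filp->f_mapping->host;

		/* commit per-inode state here; datasync still selects a
		 * data-only flush */
		return 0;
	}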
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
  * be removed...
  */
 
-static int reiserfs_sync_file(struct file *filp,
-			      struct dentry *dentry, int datasync)
+static int reiserfs_sync_file(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	int barrier_done;
 
@@ -147,7 +146,8 @@ static int reiserfs_sync_file(struct file *filp,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
@@ -3075,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
-	if (attr->ia_valid & ATTR_SIZE) {
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 
+	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		** done for us in vmtruncate
 		*/
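
The setattr hunk widens the quota-initialization trigger from size changes alone to any attribute change that can move accounted usage between owners. A condensed sketch of the intended ordering, assuming the is_quota_modification() helper from quotaops.h of this kernel generation:

	/* Sketch: initialize quota before any change that shifts accounted
	 * usage (ATTR_SIZE, ATTR_UID, ATTR_GID), not only before truncation. */
	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);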
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed. start with the first log block, find
 	** all the valid transactions, and pick out the oldest.
 	*/
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
@@ -560,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
 */
 static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
 {
-
-	/* the quota init calls have to know who to charge the quota to, so
-	** we have to set uid and gid here
-	*/
-	inode->i_uid = current_fsuid();
-	inode->i_mode = mode;
 	/* Make inode invalid - just in case we are going to drop it before
 	 * the initialization happens */
 	INODE_PKEY(inode)->k_objectid = 0;
-
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
+	/* the quota init calls have to know who to charge the quota to, so
+	** we have to set uid and gid here
+	*/
+	inode_init_owner(inode, dir, mode);
 	dquot_initialize(inode);
 	return 0;
 }
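
inode_init_owner() centralizes the owner setup that new_inode_init() used to open-code. Hedged as a paraphrase of the generic VFS helper rather than reiserfs code, it behaves roughly like:

	/* Paraphrase of inode_init_owner(inode, dir, mode): */
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* propagate setgid to subdirs */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;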
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -157,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
+	int quota_enabled[MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -178,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret = reiserfs_quota_on_mount(s, i);
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
@@ -303,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i])
-			vfs_quota_off(s, i, 0);
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
 	}
 	if (ms_active_set)
 		/* Restore the flag back */
@@ -465,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
@@ -619,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -633,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = vfs_quota_off,
-	.quota_sync = vfs_quota_sync,
-	.get_info = vfs_get_dqinfo,
-	.set_info = vfs_set_dqinfo,
-	.get_dqblk = vfs_get_dqblk,
-	.set_dqblk = vfs_set_dqblk,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_info = dquot_get_dqinfo,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
 };
 #endif
 
@@ -1241,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		if (s->s_flags & MS_RDONLY)
 			/* it is read-only already */
 			goto out_ok;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
 		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1294,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	s->s_dirt = 0;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1618,10 +1635,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1893,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
@@ -2023,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
 					REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name, int remount)
+			     char *name)
 {
 	int err;
 	struct path path;
@@ -2040,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
-	/* No more checks needed? Path and format_id are bogus anyway... */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, 1);
+
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
@@ -2086,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	if (err)
 		goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
 	path_put(&path);
 	return err;
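
The remount hunks above converge on the generic suspend/resume pattern for journaled quota. A condensed sketch; the error label is illustrative, not from this patch:

	/* Sketch: going read-only suspends quota accounting; returning to
	 * read-write resumes it and then replays unfinished work. */
	if (*mount_flags & MS_RDONLY) {
		err = dquot_suspend(s, -1);	/* -1 = every quota type */
		if (err < 0)
			goto out_err;
	} else {
		dquot_resume(s, -1);
		finish_unfinished(s);
	}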
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -553,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -722,11 +723,11 @@ out:
 	     (handler) = *(handlers)++)
 
 /* This is the implementation for the xattr plugin infrastructure */
-static inline struct xattr_handler *
-find_xattr_handler_prefix(struct xattr_handler **handlers,
+static inline const struct xattr_handler *
+find_xattr_handler_prefix(const struct xattr_handler **handlers,
 			   const char *name)
 {
-	struct xattr_handler *xah;
+	const struct xattr_handler *xah;
 
 	if (!handlers)
 		return NULL;
@@ -747,7 +748,7 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 		  size_t size)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -766,7 +767,7 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -783,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
 	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -806,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
 		size_t size;
 		if (name[0] != '.' ||
 		    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-			struct xattr_handler *handler;
+			const struct xattr_handler *handler;
 			handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
 							    name);
 			if (!handler)	/* Unsupported xattr name */
@@ -919,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
 #endif
 
 /* Actual operations that are exported to VFS-land */
-struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
 #ifdef CONFIG_REISERFS_FS_XATTR
 	&reiserfs_xattr_user_handler,
 	&reiserfs_xattr_trusted_handler,
@@ -972,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-	    !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
}
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1000,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
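
Hiding .reiserfs_priv moves from poisoning ->d_compare on the superblock root to installing a ->d_revalidate on the private dentry itself, so any lookup that reaches it simply fails. The mechanism, reduced to a hedged standalone sketch with illustrative names:

	/* Sketch: a d_revalidate that makes a dentry unreachable. */
	static int hide_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		return -EPERM;		/* every lookup of this dentry fails */
	}

	static const struct dentry_operations hidden_ops = {
		.d_revalidate = hide_revalidate,
	};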
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
@@ -499,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_access_handler = {
+const struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.get = posix_acl_get,
@@ -519,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_default_handler = {
+const struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
@@ -110,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
 	sec->value = NULL;
 }
 
-struct xattr_handler reiserfs_xattr_security_handler = {
+const struct xattr_handler reiserfs_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.get = security_get,
 	.set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_trusted_handler = {
+const struct xattr_handler reiserfs_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = trusted_get,
 	.set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_user_handler = {
+const struct xattr_handler reiserfs_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.get = user_get,
 	.set = user_set,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = smb_readdir,
-	.ioctl = smb_ioctl,
+	.unlocked_ioctl = smb_ioctl,
 	.open = smb_dir_open,
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
@@ -29,8 +28,9 @@
 #include "proto.h"
 
 static int
-smb_fsync(struct file *file, struct dentry * dentry, int datasync)
+smb_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct smb_sb_info *server = server_from_dentry(dentry);
 	int result;
 
@@ -438,7 +438,7 @@ const struct file_operations smb_file_operations =
 	.aio_read = smb_file_aio_read,
 	.write = do_sync_write,
 	.aio_write = smb_file_aio_write,
-	.ioctl = smb_ioctl,
+	.unlocked_ioctl = smb_ioctl,
 	.mmap = smb_file_mmap,
 	.open = smb_file_open,
 	.release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
@@ -706,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
 		error = server->ops->truncate(inode, attr->ia_size);
 		if (error)
 			goto out;
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 		if (error)
 			goto out;
 		refresh = 1;
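
smbfs now owns a per-connection backing_dev_info. The pairing rule the hunks above implement, sketched with the same calls the patch uses:

	/* Sketch: every successful setup needs a matching destroy on both
	 * the fill_super error path and in put_super. */
	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
		goto out_bdi;		/* nothing to tear down yet */
	sb->s_bdi = &server->bdi;	/* writeback now targets this bdi */
	/* ... on any later failure, and again in smb_put_super(): */
	bdi_destroy(&server->bdi);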
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/net.h>
 
 #include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
 
 #include "proto.h"
 
-int
-smb_ioctl(struct inode *inode, struct file *filp,
-	  unsigned int cmd, unsigned long arg)
+long
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct smb_sb_info *server = server_from_inode(inode);
+	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
 	struct smb_conn_opt opt;
 	int result = -EINVAL;
 
+	lock_kernel();
 	switch (cmd) {
 		uid16_t uid16;
 		uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
 	default:
 		break;
 	}
+	unlock_kernel();
 
 	return result;
 }
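
This is the standard BKL-pushdown shape: the VFS no longer takes the big kernel lock around ->unlocked_ioctl, so the handler takes it itself and derives the inode from the file. A generic sketch; example_ioctl is a placeholder name:

	static long example_ioctl(struct file *filp, unsigned int cmd,
				  unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		long result = -ENOTTY;

		lock_kernel();	/* only this handler still wants the BKL */
		/* cmd-specific work, using inode/filp as before */
		unlock_kernel();
		return result;
	}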
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern const struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
-extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 /* smbiod.c */
 extern void smbiod_wake_up(void);
 extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -192,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			break;
 		}
 
-		if (pipe->nrbufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 
 			buf->page = spd->pages[page_nr];
@@ -213,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 		if (!--spd->nr_pages)
 			break;
-		if (pipe->nrbufs < PIPE_BUFFERS)
+		if (pipe->nrbufs < pipe->buffers)
 			continue;
 
 		break;
@@ -264,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 	page_cache_release(spd->pages[i]);
 }
 
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -271,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -285,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -334,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -355,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -392,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				error = -ENOMEM;
 				break;
 			}
-			page_cache_release(pages[page_nr]);
-			pages[page_nr] = page;
+			page_cache_release(spd.pages[page_nr]);
+			spd.pages[page_nr] = page;
 		}
 		/*
 		 * page was already under io and is now done, great
@@ -450,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -463,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -559,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -575,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -590,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -609,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -624,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -783,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1210,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1370,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1413,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1449,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1592,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1601,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1737,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1809,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1830,7 +1890,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1840,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1913,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
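
Two conventions run through every splice.c hunk. First, the on-stack page/partial arrays now cover only PIPE_DEF_BUFFERS; splice_grow_spd() kmallocs larger arrays when a pipe was resized, and splice_shrink_spd() frees them. Second, ring arithmetic switches from "% PIPE_BUFFERS" to "& (pipe->buffers - 1)", which is equivalent only while pipe->buffers stays a power of two. A condensed usage sketch of the pattern, not a complete caller:

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = { .pages = pages, .partial = partial };
	ssize_t ret;

	if (splice_grow_spd(pipe, &spd))	/* no-op for default-sized pipes */
		return -ENOMEM;
	/* fill spd.pages[]/spd.partial[], at most pipe->buffers entries,
	 * indexing the ring with "& (pipe->buffers - 1)" */
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* frees only what grow allocated */
	return ret;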
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 1cb0d81b164b..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-		sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
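Each inode variant above now carries a 32-bit xattr id; after the type switch, squashfs_xattr_lookup() resolves it to the on-disk xattr location and the xattr size is charged to i_blocks in 512-byte units. The expression ((size - 1) >> 9) + 1 is ceil(size / 512) for size >= 1; a minimal standalone check of that arithmetic (illustrative helper name, not part of the patch):

#include <assert.h>

static unsigned long xattr_blocks(unsigned long size)
{
	/* ((size - 1) >> 9) + 1 == ceil(size / 512) for size >= 1 */
	return ((size - 1) >> 9) + 1;
}

int main(void)
{
	assert(xattr_blocks(1) == 1);
	assert(xattr_blocks(512) == 1);
	assert(xattr_blocks(513) == 2);
	return 0;
}
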
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
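SQUASHFS_XATTR_BLK() and SQUASHFS_XATTR_OFFSET() unpack a packed 64-bit on-disk xattr reference: the upper bits give the start of the metadata block relative to the xattr table, and the low 16 bits give the offset within the uncompressed block (metadata blocks hold at most 8 KiB, so 16 bits suffice). A standalone sketch of the split:

#include <stdio.h>
#include <stdint.h>

/* mirrors of the two macros above, for illustration only */
#define XATTR_BLK(a)	((unsigned int)((a) >> 16))
#define XATTR_OFFSET(a)	((unsigned int)((a) & 0xffff))

int main(void)
{
	uint64_t ref = ((uint64_t)0x1234 << 16) | 0x42;

	printf("block %#x, offset %#x\n", XATTR_BLK(ref), XATTR_OFFSET(ref));
	/* prints: block 0x1234, offset 0x42 */
	return 0;
}
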
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
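On disk each xattr is serialized as a squashfs_xattr_entry header (type and name size) followed by the name bytes, then a squashfs_xattr_val header (vsize) followed by the value bytes; when SQUASHFS_XATTR_VALUE_OOL is set in the type, the inline value is instead an 8-byte reference to the real, shared value elsewhere in the table. A host-endian sketch of that serialization order (stand-in structs, not the __le16/__le32 kernel ones):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct xattr_entry { uint16_t type; uint16_t size; };	/* then name bytes */
struct xattr_val { uint32_t vsize; };			/* then value bytes */

int main(void)
{
	unsigned char buf[64], *p = buf;
	struct xattr_entry entry = { 0 /* SQUASHFS_XATTR_USER */, 7 };
	struct xattr_val val = { 3 };

	memcpy(p, &entry, sizeof(entry));	p += sizeof(entry);
	memcpy(p, "comment", 7);		p += 7;
	memcpy(p, &val, sizeof(val));		p += sizeof(val);
	memcpy(p, "abc", 3);			p += 3;

	printf("one entry occupies %zu bytes\n", (size_t)(p - buf));
	return 0;
}
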
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3550aec2f655..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -275,7 +285,8 @@ allocate_root:
275 285
276 err = squashfs_read_inode(root, root_inode); 286 err = squashfs_read_inode(root, root_inode);
277 if (err) { 287 if (err) {
278 iget_failed(root); 288 make_bad_inode(root);
289 iput(root);
279 goto failed_mount; 290 goto failed_mount;
280 } 291 }
281 insert_inode_hash(root); 292 insert_inode_hash(root);
@@ -300,6 +311,7 @@ failed_mount:
300 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
301 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
302 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
303 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
304 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
305 kfree(sblk); 317 kfree(sblk);
@@ -353,6 +365,8 @@ static void squashfs_put_super(struct super_block *sb)
353 kfree(sbi->id_table); 365 kfree(sbi->id_table);
354 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
355 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
356 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
357 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
358 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,14 +33,15 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/vfs.h> 34#include <linux/vfs.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h> 36#include <linux/string.h>
38#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
43#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
44 45
45static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
46{ 47{
@@ -115,3 +116,12 @@ error_out:
115const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
116 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
117}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
 91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
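squashfs_listxattr() follows the standard listxattr buffer protocol: with a NULL buffer it only sums prefix and name lengths, otherwise it copies NUL-terminated, namespace-prefixed names and fails with -ERANGE if they do not fit. From user space that becomes the usual two-call pattern:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t len = listxattr(path, NULL, 0);		/* size query */
	char *buf, *p;

	if (len <= 0)
		return 0;		/* no xattrs, or unsupported */
	buf = malloc(len);
	if (buf == NULL)
		return 1;
	len = listxattr(path, buf, len);
	for (p = buf; p < buf + len; p += strlen(p) + 1)
		puts(p);		/* names are NUL-separated */
	free(buf);
	return 0;
}
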
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
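Registering this table in sb->s_xattr is what makes generic_getxattr() usable as the ->getxattr method: the VFS picks the handler whose prefix matches, strips the prefix, and calls handler->get() with the bare name, which the squashfs_*_get() wrappers then re-qualify via the SQUASHFS_XATTR_* name index. A simplified model of that prefix resolution (illustrative only, not the kernel's xattr_resolve_name()):

#include <stdio.h>
#include <string.h>

struct handler { const char *prefix; };

static const struct handler handlers[] = {
	{ "user." }, { "trusted." }, { "security." }, { NULL }
};

static const struct handler *resolve(const char **name)
{
	const struct handler *h;

	for (h = handlers; h->prefix; h++) {
		size_t n = strlen(h->prefix);

		if (strncmp(*name, h->prefix, n) == 0) {
			*name += n;	/* handler sees the bare name */
			return h;
		}
	}
	return NULL;
}

int main(void)
{
	const char *name = "user.comment";
	const struct handler *h = resolve(&name);

	printf("%s -> %s\n", h ? h->prefix : "(none)", name);
	/* prints: user. -> comment */
	return 0;
}
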
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
 28 unsigned int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
 38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
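The mapping is plain arithmetic: struct squashfs_xattr_id is 16 bytes and a metadata block holds SQUASHFS_METADATA_SIZE (8 KiB) of uncompressed data, so each block covers 512 ids; SQUASHFS_XATTR_BLOCK() and SQUASHFS_XATTR_BLOCK_OFFSET() are just the quotient and remainder. A worked example:

#include <stdio.h>

#define METADATA_SIZE	8192	/* SQUASHFS_METADATA_SIZE */
#define ID_SIZE		16	/* sizeof(struct squashfs_xattr_id) */

int main(void)
{
	unsigned int index = 1000;
	unsigned int bytes = index * ID_SIZE;

	printf("index %u -> block %u, offset %u\n", index,
	       bytes / METADATA_SIZE, bytes % METADATA_SIZE);
	/* prints: index 1000 -> block 1, offset 7808 */
	return 0;
}
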
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..7a603874e483 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h>
27#include <linux/zlib.h> 28#include <linux/zlib.h>
28 29
29#include "squashfs_fs.h" 30#include "squashfs_fs.h"
@@ -127,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
127 goto release_mutex; 128 goto release_mutex;
128 } 129 }
129 130
131 length = stream->total_out;
130 mutex_unlock(&msblk->read_data_mutex); 132 mutex_unlock(&msblk->read_data_mutex);
131 return stream->total_out; 133 return length;
132 134
133release_mutex: 135release_mutex:
134 mutex_unlock(&msblk->read_data_mutex); 136 mutex_unlock(&msblk->read_data_mutex);
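The zlib change fixes a small race: the z_stream is shared between readers and protected only by read_data_mutex, so reading stream->total_out after dropping the mutex could observe a concurrent decompression's value. The fix snapshots it into a local while the lock is still held; the generic pattern, sketched with pthreads:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long shared_total;	/* stands in for stream->total_out */

long finish_and_report(void)
{
	long length;

	pthread_mutex_lock(&lock);
	/* ... decompression updates shared_total ... */
	length = shared_total;	/* copy before unlock, as in the patch */
	pthread_mutex_unlock(&lock);
	return length;		/* not: return shared_total; */
}

int main(void)
{
	shared_total = 42;
	return finish_and_report() == 42 ? 0 : 1;
}
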
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/statfs.h>
7#include <linux/security.h>
8#include <linux/uaccess.h>
9
10int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
11{
12 int retval = -ENODEV;
13
14 if (dentry) {
15 retval = -ENOSYS;
16 if (dentry->d_sb->s_op->statfs) {
17 memset(buf, 0, sizeof(*buf));
18 retval = security_sb_statfs(dentry);
19 if (retval)
20 return retval;
21 retval = dentry->d_sb->s_op->statfs(dentry, buf);
22 if (retval == 0 && buf->f_frsize == 0)
23 buf->f_frsize = buf->f_bsize;
24 }
25 }
26 return retval;
27}
28
29EXPORT_SYMBOL(vfs_statfs);
30
31static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
32{
33 struct kstatfs st;
34 int retval;
35
36 retval = vfs_statfs(dentry, &st);
37 if (retval)
38 return retval;
39
40 if (sizeof(*buf) == sizeof(st))
41 memcpy(buf, &st, sizeof(st));
42 else {
43 if (sizeof buf->f_blocks == 4) {
44 if ((st.f_blocks | st.f_bfree | st.f_bavail |
45 st.f_bsize | st.f_frsize) &
46 0xffffffff00000000ULL)
47 return -EOVERFLOW;
48 /*
49 * f_files and f_ffree may be -1; it's okay to stuff
50 * that into 32 bits
51 */
52 if (st.f_files != -1 &&
53 (st.f_files & 0xffffffff00000000ULL))
54 return -EOVERFLOW;
55 if (st.f_ffree != -1 &&
56 (st.f_ffree & 0xffffffff00000000ULL))
57 return -EOVERFLOW;
58 }
59
60 buf->f_type = st.f_type;
61 buf->f_bsize = st.f_bsize;
62 buf->f_blocks = st.f_blocks;
63 buf->f_bfree = st.f_bfree;
64 buf->f_bavail = st.f_bavail;
65 buf->f_files = st.f_files;
66 buf->f_ffree = st.f_ffree;
67 buf->f_fsid = st.f_fsid;
68 buf->f_namelen = st.f_namelen;
69 buf->f_frsize = st.f_frsize;
70 memset(buf->f_spare, 0, sizeof(buf->f_spare));
71 }
72 return 0;
73}
74
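The 32-bit compatibility logic deserves a note: when the struct statfs fields are only 32 bits wide, any 64-bit count with bits set above bit 31 cannot be represented and must fail with -EOVERFLOW rather than be silently truncated; f_files/f_ffree of -1 mean "unknown" and are explicitly let through even though all their high bits are set. A standalone illustration of the mask test:

#include <stdio.h>
#include <stdint.h>

#define HIGH_BITS	0xffffffff00000000ULL

int main(void)
{
	uint64_t fits = 0xffffffffULL;		/* representable in 32 bits */
	uint64_t too_big = 1ULL << 32;		/* would be truncated */
	uint64_t unknown = (uint64_t)-1;	/* f_files "unknown" marker */

	printf("%d %d %d\n",
	       !!(fits & HIGH_BITS),	/* 0: ok */
	       !!(too_big & HIGH_BITS),	/* 1: -EOVERFLOW */
	       !!(unknown & HIGH_BITS));/* 1, but -1 is whitelisted */
	return 0;
}
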
75static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
76{
77 struct kstatfs st;
78 int retval;
79
80 retval = vfs_statfs(dentry, &st);
81 if (retval)
82 return retval;
83
84 if (sizeof(*buf) == sizeof(st))
85 memcpy(buf, &st, sizeof(st));
86 else {
87 buf->f_type = st.f_type;
88 buf->f_bsize = st.f_bsize;
89 buf->f_blocks = st.f_blocks;
90 buf->f_bfree = st.f_bfree;
91 buf->f_bavail = st.f_bavail;
92 buf->f_files = st.f_files;
93 buf->f_ffree = st.f_ffree;
94 buf->f_fsid = st.f_fsid;
95 buf->f_namelen = st.f_namelen;
96 buf->f_frsize = st.f_frsize;
97 memset(buf->f_spare, 0, sizeof(buf->f_spare));
98 }
99 return 0;
100}
101
102SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
103{
104 struct path path;
105 int error;
106
107 error = user_path(pathname, &path);
108 if (!error) {
109 struct statfs tmp;
110 error = vfs_statfs_native(path.dentry, &tmp);
111 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
112 error = -EFAULT;
113 path_put(&path);
114 }
115 return error;
116}
117
118SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
119{
120 struct path path;
121 long error;
122
123 if (sz != sizeof(*buf))
124 return -EINVAL;
125 error = user_path(pathname, &path);
126 if (!error) {
127 struct statfs64 tmp;
128 error = vfs_statfs64(path.dentry, &tmp);
129 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
130 error = -EFAULT;
131 path_put(&path);
132 }
133 return error;
134}
135
136SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
137{
138 struct file *file;
139 struct statfs tmp;
140 int error;
141
142 error = -EBADF;
143 file = fget(fd);
144 if (!file)
145 goto out;
146 error = vfs_statfs_native(file->f_path.dentry, &tmp);
147 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
148 error = -EFAULT;
149 fput(file);
150out:
151 return error;
152}
153
154SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
155{
156 struct file *file;
157 struct statfs64 tmp;
158 int error;
159
160 if (sz != sizeof(*buf))
161 return -EINVAL;
162
163 error = -EBADF;
164 file = fget(fd);
165 if (!file)
166 goto out;
167 error = vfs_statfs64(file->f_path.dentry, &tmp);
168 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
169 error = -EFAULT;
170 fput(file);
171out:
172 return error;
173}
174
175SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
176{
177 struct super_block *s;
178 struct ustat tmp;
179 struct kstatfs sbuf;
180 int err;
181
182 s = user_get_super(new_decode_dev(dev));
183 if (!s)
184 return -EINVAL;
185
186 err = vfs_statfs(s->s_root, &sbuf);
187 drop_super(s);
188 if (err)
189 return err;
190
191 memset(&tmp,0,sizeof(struct ustat));
192 tmp.f_tfree = sbuf.f_bfree;
193 tmp.f_tinode = sbuf.f_ffree;
194
195 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
196}
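The four syscalls differ only in where they start (path versus file descriptor) and in the width of the user-visible structure; the *64 variants additionally insist that the caller's idea of sizeof(struct statfs64) matches the kernel's. Exercising the path-based variant from user space:

#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
	struct statfs st;

	if (statfs(argc > 1 ? argv[1] : "/", &st) != 0) {
		perror("statfs");
		return 1;
	}
	printf("type %#lx, bsize %ld, blocks %llu, free %llu\n",
	       (unsigned long)st.f_type, (long)st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree);
	return 0;
}
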
diff --git a/fs/super.c b/fs/super.c
index f35ac6022109..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,22 +22,14 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 27#include <linux/mount.h>
32#include <linux/security.h> 28#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 30#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 31#include <linux/mutex.h>
39#include <linux/file.h> 32#include <linux/backing-dev.h>
40#include <asm/uaccess.h>
41#include "internal.h" 33#include "internal.h"
42 34
43 35
@@ -92,16 +84,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
92 * subclass. 84 * subclass.
93 */ 85 */
94 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 86 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
95 s->s_count = S_BIAS; 87 s->s_count = 1;
96 atomic_set(&s->s_active, 1); 88 atomic_set(&s->s_active, 1);
97 mutex_init(&s->s_vfs_rename_mutex); 89 mutex_init(&s->s_vfs_rename_mutex);
90 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
98 mutex_init(&s->s_dquot.dqio_mutex); 91 mutex_init(&s->s_dquot.dqio_mutex);
99 mutex_init(&s->s_dquot.dqonoff_mutex); 92 mutex_init(&s->s_dquot.dqonoff_mutex);
100 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
101 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
102 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
103 s->dq_op = sb_dquot_ops;
104 s->s_qcop = sb_quotactl_ops;
105 s->s_op = &default_op; 96 s->s_op = &default_op;
106 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
107 } 98 }
@@ -126,39 +117,14 @@ static inline void destroy_super(struct super_block *s)
126/* Superblock refcounting */ 117/* Superblock refcounting */
127 118
128/* 119/*
129 * Drop a superblock's refcount. Returns non-zero if the superblock was 120 * Drop a superblock's refcount. The caller must hold sb_lock.
130 * destroyed. The caller must hold sb_lock.
131 */ 121 */
132static int __put_super(struct super_block *sb) 122void __put_super(struct super_block *sb)
133{ 123{
134 int ret = 0;
135
136 if (!--sb->s_count) { 124 if (!--sb->s_count) {
125 list_del_init(&sb->s_list);
137 destroy_super(sb); 126 destroy_super(sb);
138 ret = 1;
139 }
140 return ret;
141}
142
143/*
144 * Drop a superblock's refcount.
145 * Returns non-zero if the superblock is about to be destroyed and
146 * at least is already removed from super_blocks list, so if we are
147 * making a loop through super blocks then we need to restart.
148 * The caller must hold sb_lock.
149 */
150int __put_super_and_need_restart(struct super_block *sb)
151{
152 /* check for race with generic_shutdown_super() */
153 if (list_empty(&sb->s_list)) {
154 /* super block is removed, need to restart... */
155 __put_super(sb);
156 return 1;
157 } 127 }
158 /* can't be the last, since s_list is still in use */
159 sb->s_count--;
160 BUG_ON(sb->s_count == 0);
161 return 0;
162} 128}
163 129
164/** 130/**
@@ -177,57 +143,47 @@ void put_super(struct super_block *sb)
177 143
178 144
179/** 145/**
180 * deactivate_super - drop an active reference to superblock 146 * deactivate_locked_super - drop an active reference to superblock
181 * @s: superblock to deactivate 147 * @s: superblock to deactivate
182 * 148 *
 183 * Drops an active reference to superblock, acquiring a temporary one if 149 * Drops an active reference to superblock, converting it into a temporary
 184 * there are no active references left. In that case we lock superblock, 150 * one if there are no other active references left. In that case we
185 * tell fs driver to shut it down and drop the temporary reference we 151 * tell fs driver to shut it down and drop the temporary reference we
186 * had just acquired. 152 * had just acquired.
153 *
154 * Caller holds exclusive lock on superblock; that lock is released.
187 */ 155 */
188void deactivate_super(struct super_block *s) 156void deactivate_locked_super(struct super_block *s)
189{ 157{
190 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
191 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 159 if (atomic_dec_and_test(&s->s_active)) {
192 s->s_count -= S_BIAS-1;
193 spin_unlock(&sb_lock);
194 vfs_dq_off(s, 0);
195 down_write(&s->s_umount);
196 fs->kill_sb(s); 160 fs->kill_sb(s);
197 put_filesystem(fs); 161 put_filesystem(fs);
198 put_super(s); 162 put_super(s);
163 } else {
164 up_write(&s->s_umount);
199 } 165 }
200} 166}
201 167
202EXPORT_SYMBOL(deactivate_super); 168EXPORT_SYMBOL(deactivate_locked_super);
203 169
204/** 170/**
205 * deactivate_locked_super - drop an active reference to superblock 171 * deactivate_super - drop an active reference to superblock
206 * @s: superblock to deactivate 172 * @s: superblock to deactivate
207 * 173 *
208 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 174 * Variant of deactivate_locked_super(), except that superblock is *not*
209 * it does not unlock it until it's all over. As the result, it's safe to 175 * locked by caller. If we are going to drop the final active reference,
210 * use to dispose of new superblock on ->get_sb() failure exits - nobody 176 * lock will be acquired prior to that.
211 * will see the sucker until it's all over. Equivalent using up_write +
212 * deactivate_super is safe for that purpose only if superblock is either
213 * safe to use or has NULL ->s_root when we unlock.
214 */ 177 */
215void deactivate_locked_super(struct super_block *s) 178void deactivate_super(struct super_block *s)
216{ 179{
217 struct file_system_type *fs = s->s_type; 180 if (!atomic_add_unless(&s->s_active, -1, 1)) {
218 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 181 down_write(&s->s_umount);
219 s->s_count -= S_BIAS-1; 182 deactivate_locked_super(s);
220 spin_unlock(&sb_lock);
221 vfs_dq_off(s, 0);
222 fs->kill_sb(s);
223 put_filesystem(fs);
224 put_super(s);
225 } else {
226 up_write(&s->s_umount);
227 } 183 }
228} 184}
229 185
230EXPORT_SYMBOL(deactivate_locked_super); 186EXPORT_SYMBOL(deactivate_super);
231 187
232/** 188/**
233 * grab_super - acquire an active reference 189 * grab_super - acquire an active reference
@@ -242,22 +198,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
242 */ 198 */
243static int grab_super(struct super_block *s) __releases(sb_lock) 199static int grab_super(struct super_block *s) __releases(sb_lock)
244{ 200{
201 if (atomic_inc_not_zero(&s->s_active)) {
202 spin_unlock(&sb_lock);
203 return 1;
204 }
205 /* it's going away */
245 s->s_count++; 206 s->s_count++;
246 spin_unlock(&sb_lock); 207 spin_unlock(&sb_lock);
208 /* wait for it to die */
247 down_write(&s->s_umount); 209 down_write(&s->s_umount);
248 if (s->s_root) {
249 spin_lock(&sb_lock);
250 if (s->s_count > S_BIAS) {
251 atomic_inc(&s->s_active);
252 s->s_count--;
253 spin_unlock(&sb_lock);
254 return 1;
255 }
256 spin_unlock(&sb_lock);
257 }
258 up_write(&s->s_umount); 210 up_write(&s->s_umount);
259 put_super(s); 211 put_super(s);
260 yield();
261 return 0; 212 return 0;
262} 213}
263 214
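After this rework s_active is a plain count of active references (the old scheme folded S_BIAS into s_count): grab_super() can take an active reference only while the count is still non-zero, via atomic_inc_not_zero(); otherwise it takes a passive s_count reference and uses s_umount purely to wait for the umount in progress to finish. A userspace model of the inc-not-zero rule with C11 atomics (a sketch, not the kernel primitive):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int s_active = 1;	/* model: one active reference held */

/* succeed only while someone else still holds an active reference;
 * once the count hits zero the superblock is going away */
static bool grab_active(void)
{
	int v = atomic_load(&s_active);

	while (v != 0)
		if (atomic_compare_exchange_weak(&s_active, &v, v + 1))
			return true;
	return false;
}

int main(void)
{
	return grab_active() ? 0 : 1;
}
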
@@ -320,8 +271,7 @@ void generic_shutdown_super(struct super_block *sb)
320 } 271 }
321 spin_lock(&sb_lock); 272 spin_lock(&sb_lock);
322 /* should be initialized for __put_super_and_need_restart() */ 273 /* should be initialized for __put_super_and_need_restart() */
323 list_del_init(&sb->s_list); 274 list_del_init(&sb->s_instances);
324 list_del(&sb->s_instances);
325 spin_unlock(&sb_lock); 275 spin_unlock(&sb_lock);
326 up_write(&sb->s_umount); 276 up_write(&sb->s_umount);
327} 277}
@@ -356,6 +306,7 @@ retry:
356 up_write(&s->s_umount); 306 up_write(&s->s_umount);
357 destroy_super(s); 307 destroy_super(s);
358 } 308 }
309 down_write(&old->s_umount);
359 return old; 310 return old;
360 } 311 }
361 } 312 }
@@ -407,11 +358,12 @@ EXPORT_SYMBOL(drop_super);
407 */ 358 */
408void sync_supers(void) 359void sync_supers(void)
409{ 360{
410 struct super_block *sb; 361 struct super_block *sb, *n;
411 362
412 spin_lock(&sb_lock); 363 spin_lock(&sb_lock);
413restart: 364 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
414 list_for_each_entry(sb, &super_blocks, s_list) { 365 if (list_empty(&sb->s_instances))
366 continue;
415 if (sb->s_op->write_super && sb->s_dirt) { 367 if (sb->s_op->write_super && sb->s_dirt) {
416 sb->s_count++; 368 sb->s_count++;
417 spin_unlock(&sb_lock); 369 spin_unlock(&sb_lock);
@@ -422,14 +374,43 @@ restart:
422 up_read(&sb->s_umount); 374 up_read(&sb->s_umount);
423 375
424 spin_lock(&sb_lock); 376 spin_lock(&sb_lock);
425 if (__put_super_and_need_restart(sb)) 377 __put_super(sb);
426 goto restart;
427 } 378 }
428 } 379 }
429 spin_unlock(&sb_lock); 380 spin_unlock(&sb_lock);
430} 381}
431 382
432/** 383/**
384 * iterate_supers - call function for all active superblocks
385 * @f: function to call
386 * @arg: argument to pass to it
387 *
388 * Scans the superblock list and calls given function, passing it
389 * locked superblock and given argument.
390 */
391void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
392{
393 struct super_block *sb, *n;
394
395 spin_lock(&sb_lock);
396 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
397 if (list_empty(&sb->s_instances))
398 continue;
399 sb->s_count++;
400 spin_unlock(&sb_lock);
401
402 down_read(&sb->s_umount);
403 if (sb->s_root)
404 f(sb, arg);
405 up_read(&sb->s_umount);
406
407 spin_lock(&sb_lock);
408 __put_super(sb);
409 }
410 spin_unlock(&sb_lock);
411}
412
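iterate_supers() factors out the walk pattern used throughout this file: a passive s_count pin taken under sb_lock, s_umount held shared across the callback, and superblocks already unlinked from their type's s_instances list skipped. A hypothetical caller, kernel-style (count_dirty and dirty_supers are made up for illustration; sb->s_dirt exists at this point in the tree):

static void count_dirty(struct super_block *sb, void *arg)
{
	/* runs with s_umount held shared and sb pinned */
	if (sb->s_dirt)
		(*(int *)arg)++;
}

static int dirty_supers(void)
{
	int n = 0;

	iterate_supers(count_dirty, &n);
	return n;
}
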
413/**
433 * get_super - get the superblock of a device 414 * get_super - get the superblock of a device
434 * @bdev: device to get the superblock for 415 * @bdev: device to get the superblock for
435 * 416 *
@@ -437,7 +418,7 @@ restart:
437 * mounted on the device given. %NULL is returned if no match is found. 418 * mounted on the device given. %NULL is returned if no match is found.
438 */ 419 */
439 420
440struct super_block * get_super(struct block_device *bdev) 421struct super_block *get_super(struct block_device *bdev)
441{ 422{
442 struct super_block *sb; 423 struct super_block *sb;
443 424
@@ -447,17 +428,20 @@ struct super_block * get_super(struct block_device *bdev)
447 spin_lock(&sb_lock); 428 spin_lock(&sb_lock);
448rescan: 429rescan:
449 list_for_each_entry(sb, &super_blocks, s_list) { 430 list_for_each_entry(sb, &super_blocks, s_list) {
431 if (list_empty(&sb->s_instances))
432 continue;
450 if (sb->s_bdev == bdev) { 433 if (sb->s_bdev == bdev) {
451 sb->s_count++; 434 sb->s_count++;
452 spin_unlock(&sb_lock); 435 spin_unlock(&sb_lock);
453 down_read(&sb->s_umount); 436 down_read(&sb->s_umount);
437 /* still alive? */
454 if (sb->s_root) 438 if (sb->s_root)
455 return sb; 439 return sb;
456 up_read(&sb->s_umount); 440 up_read(&sb->s_umount);
457 /* restart only when sb is no longer on the list */ 441 /* nope, got unmounted */
458 spin_lock(&sb_lock); 442 spin_lock(&sb_lock);
459 if (__put_super_and_need_restart(sb)) 443 __put_super(sb);
460 goto rescan; 444 goto rescan;
461 } 445 }
462 } 446 }
463 spin_unlock(&sb_lock); 447 spin_unlock(&sb_lock);
@@ -472,7 +456,7 @@ EXPORT_SYMBOL(get_super);
472 * 456 *
473 * Scans the superblock list and finds the superblock of the file system 457 * Scans the superblock list and finds the superblock of the file system
474 * mounted on the device given. Returns the superblock with an active 458 * mounted on the device given. Returns the superblock with an active
475 * reference and s_umount held exclusively or %NULL if none was found. 459 * reference or %NULL if none was found.
476 */ 460 */
477struct super_block *get_active_super(struct block_device *bdev) 461struct super_block *get_active_super(struct block_device *bdev)
478{ 462{
@@ -481,81 +465,49 @@ struct super_block *get_active_super(struct block_device *bdev)
481 if (!bdev) 465 if (!bdev)
482 return NULL; 466 return NULL;
483 467
468restart:
484 spin_lock(&sb_lock); 469 spin_lock(&sb_lock);
485 list_for_each_entry(sb, &super_blocks, s_list) { 470 list_for_each_entry(sb, &super_blocks, s_list) {
486 if (sb->s_bdev != bdev) 471 if (list_empty(&sb->s_instances))
487 continue; 472 continue;
488 473 if (sb->s_bdev == bdev) {
489 sb->s_count++; 474 if (grab_super(sb)) /* drops sb_lock */
490 spin_unlock(&sb_lock);
491 down_write(&sb->s_umount);
492 if (sb->s_root) {
493 spin_lock(&sb_lock);
494 if (sb->s_count > S_BIAS) {
495 atomic_inc(&sb->s_active);
496 sb->s_count--;
497 spin_unlock(&sb_lock);
498 return sb; 475 return sb;
499 } 476 else
500 spin_unlock(&sb_lock); 477 goto restart;
501 } 478 }
502 up_write(&sb->s_umount);
503 put_super(sb);
504 yield();
505 spin_lock(&sb_lock);
506 } 479 }
507 spin_unlock(&sb_lock); 480 spin_unlock(&sb_lock);
508 return NULL; 481 return NULL;
509} 482}
510 483
511struct super_block * user_get_super(dev_t dev) 484struct super_block *user_get_super(dev_t dev)
512{ 485{
513 struct super_block *sb; 486 struct super_block *sb;
514 487
515 spin_lock(&sb_lock); 488 spin_lock(&sb_lock);
516rescan: 489rescan:
517 list_for_each_entry(sb, &super_blocks, s_list) { 490 list_for_each_entry(sb, &super_blocks, s_list) {
491 if (list_empty(&sb->s_instances))
492 continue;
518 if (sb->s_dev == dev) { 493 if (sb->s_dev == dev) {
519 sb->s_count++; 494 sb->s_count++;
520 spin_unlock(&sb_lock); 495 spin_unlock(&sb_lock);
521 down_read(&sb->s_umount); 496 down_read(&sb->s_umount);
497 /* still alive? */
522 if (sb->s_root) 498 if (sb->s_root)
523 return sb; 499 return sb;
524 up_read(&sb->s_umount); 500 up_read(&sb->s_umount);
525 /* restart only when sb is no longer on the list */ 501 /* nope, got unmounted */
526 spin_lock(&sb_lock); 502 spin_lock(&sb_lock);
527 if (__put_super_and_need_restart(sb)) 503 __put_super(sb);
528 goto rescan; 504 goto rescan;
529 } 505 }
530 } 506 }
531 spin_unlock(&sb_lock); 507 spin_unlock(&sb_lock);
532 return NULL; 508 return NULL;
533} 509}
534 510
535SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
536{
537 struct super_block *s;
538 struct ustat tmp;
539 struct kstatfs sbuf;
540 int err = -EINVAL;
541
542 s = user_get_super(new_decode_dev(dev));
543 if (s == NULL)
544 goto out;
545 err = vfs_statfs(s->s_root, &sbuf);
546 drop_super(s);
547 if (err)
548 goto out;
549
550 memset(&tmp,0,sizeof(struct ustat));
551 tmp.f_tfree = sbuf.f_bfree;
552 tmp.f_tinode = sbuf.f_ffree;
553
554 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
555out:
556 return err;
557}
558
559/** 511/**
560 * do_remount_sb - asks filesystem to change mount options. 512 * do_remount_sb - asks filesystem to change mount options.
561 * @sb: superblock in question 513 * @sb: superblock in question
@@ -568,7 +520,7 @@ out:
568int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
569{ 521{
570 int retval; 522 int retval;
571 int remount_rw, remount_ro; 523 int remount_ro;
572 524
573 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY; 526 return -EBUSY;
@@ -584,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
584 sync_filesystem(sb); 536 sync_filesystem(sb);
585 537
586 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
587 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
588 539
589 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
590 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -593,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
593 mark_files_ro(sb); 544 mark_files_ro(sb);
594 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
595 return -EBUSY; 546 return -EBUSY;
596 retval = vfs_dq_off(sb, 1);
597 if (retval < 0 && retval != -ENOSYS)
598 return -EBUSY;
599 } 547 }
600 548
601 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -604,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
604 return retval; 552 return retval;
605 } 553 }
606 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
607 if (remount_rw) 555
608 vfs_dq_quota_on_remount(sb);
609 /* 556 /*
610 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
611 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -621,24 +568,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
621 568
622static void do_emergency_remount(struct work_struct *work) 569static void do_emergency_remount(struct work_struct *work)
623{ 570{
624 struct super_block *sb; 571 struct super_block *sb, *n;
625 572
626 spin_lock(&sb_lock); 573 spin_lock(&sb_lock);
627 list_for_each_entry(sb, &super_blocks, s_list) { 574 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
575 if (list_empty(&sb->s_instances))
576 continue;
628 sb->s_count++; 577 sb->s_count++;
629 spin_unlock(&sb_lock); 578 spin_unlock(&sb_lock);
630 down_write(&sb->s_umount); 579 down_write(&sb->s_umount);
631 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 580 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
632 /* 581 /*
633 * ->remount_fs needs lock_kernel().
634 *
635 * What lock protects sb->s_flags?? 582 * What lock protects sb->s_flags??
636 */ 583 */
637 do_remount_sb(sb, MS_RDONLY, NULL, 1); 584 do_remount_sb(sb, MS_RDONLY, NULL, 1);
638 } 585 }
639 up_write(&sb->s_umount); 586 up_write(&sb->s_umount);
640 put_super(sb);
641 spin_lock(&sb_lock); 587 spin_lock(&sb_lock);
588 __put_super(sb);
642 } 589 }
643 spin_unlock(&sb_lock); 590 spin_unlock(&sb_lock);
644 kfree(work); 591 kfree(work);
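The loop above pins each superblock with s_count before dropping sb_lock, and unpins with __put_super() once the lock is re-taken; list_for_each_entry_safe() keeps the cursor valid across the unlocked window. A sketch of that traversal pattern, assuming the same fs/super.c internals (sb_lock, super_blocks, __put_super) are in scope; for_each_super_locked() and do_something() are hypothetical names:

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static void for_each_super_locked(void (*do_something)(struct super_block *))
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;		/* already being torn down */
		sb->s_count++;			/* pin: sb outlives the unlock */
		spin_unlock(&sb_lock);

		down_write(&sb->s_umount);
		do_something(sb);
		up_write(&sb->s_umount);

		spin_lock(&sb_lock);
		__put_super(sb);		/* unpin under sb_lock */
	}
	spin_unlock(&sb_lock);
}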
@@ -693,6 +640,7 @@ int set_anon_super(struct super_block *s, void *data)
693 return -EMFILE; 640 return -EMFILE;
694 } 641 }
695 s->s_dev = MKDEV(0, dev & MINORMASK); 642 s->s_dev = MKDEV(0, dev & MINORMASK);
643 s->s_bdi = &noop_backing_dev_info;
696 return 0; 644 return 0;
697} 645}
698 646
@@ -954,10 +902,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
954 if (error < 0) 902 if (error < 0)
955 goto out_free_secdata; 903 goto out_free_secdata;
956 BUG_ON(!mnt->mnt_sb); 904 BUG_ON(!mnt->mnt_sb);
905 WARN_ON(!mnt->mnt_sb->s_bdi);
957 906
958 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 907 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
959 if (error) 908 if (error)
960 goto out_sb; 909 goto out_sb;
961 910
962 /* 911 /*
963 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE 912 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
@@ -987,6 +936,96 @@ out:
987 936
988EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
989 938
939/**
940 * freeze_super - lock the filesystem and force it into a consistent state
941 * @sb: the super to lock
942 *
943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
945 * -EBUSY.
946 */
947int freeze_super(struct super_block *sb)
948{
949 int ret;
950
951 atomic_inc(&sb->s_active);
952 down_write(&sb->s_umount);
953 if (sb->s_frozen) {
954 deactivate_locked_super(sb);
955 return -EBUSY;
956 }
957
958 if (sb->s_flags & MS_RDONLY) {
959 sb->s_frozen = SB_FREEZE_TRANS;
960 smp_wmb();
961 up_write(&sb->s_umount);
962 return 0;
963 }
964
965 sb->s_frozen = SB_FREEZE_WRITE;
966 smp_wmb();
967
968 sync_filesystem(sb);
969
970 sb->s_frozen = SB_FREEZE_TRANS;
971 smp_wmb();
972
973 sync_blockdev(sb->s_bdev);
974 if (sb->s_op->freeze_fs) {
975 ret = sb->s_op->freeze_fs(sb);
976 if (ret) {
977 printk(KERN_ERR
978 "VFS:Filesystem freeze failed\n");
979 sb->s_frozen = SB_UNFROZEN;
980 deactivate_locked_super(sb);
981 return ret;
982 }
983 }
984 up_write(&sb->s_umount);
985 return 0;
986}
987EXPORT_SYMBOL(freeze_super);
988
989/**
990 * thaw_super -- unlock filesystem
991 * @sb: the super to thaw
992 *
993 * Unlocks the filesystem and marks it writeable again after freeze_super().
994 */
995int thaw_super(struct super_block *sb)
996{
997 int error;
998
999 down_write(&sb->s_umount);
1000 if (sb->s_frozen == SB_UNFROZEN) {
1001 up_write(&sb->s_umount);
1002 return -EINVAL;
1003 }
1004
1005 if (sb->s_flags & MS_RDONLY)
1006 goto out;
1007
1008 if (sb->s_op->unfreeze_fs) {
1009 error = sb->s_op->unfreeze_fs(sb);
1010 if (error) {
1011 printk(KERN_ERR
1012 "VFS:Filesystem thaw failed\n");
1013 sb->s_frozen = SB_FREEZE_TRANS;
1014 up_write(&sb->s_umount);
1015 return error;
1016 }
1017 }
1018
1019out:
1020 sb->s_frozen = SB_UNFROZEN;
1021 smp_wmb();
1022 wake_up(&sb->s_wait_unfrozen);
1023 deactivate_locked_super(sb);
1024
1025 return 0;
1026}
1027EXPORT_SYMBOL(thaw_super);
1028
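A hedged usage sketch for the pair: this is roughly what an FIFREEZE/FITHAW ioctl handler would do with the helpers added above (example_freeze_ioctl() is illustrative, not a real call site in this commit):

#include <linux/fs.h>

static int example_freeze_ioctl(struct super_block *sb, unsigned int cmd)
{
	switch (cmd) {
	case FIFREEZE:
		return freeze_super(sb);	/* -EBUSY if already frozen */
	case FITHAW:
		return thaw_super(sb);		/* -EINVAL if not frozen */
	default:
		return -ENOTTY;
	}
}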
990static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1029static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
991{ 1030{
992 int err; 1031 int err;
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..c9f83f480ec5 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/sched.h> 10#include <linux/sched.h>
10#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -13,6 +14,7 @@
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/quotaops.h> 15#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
16#include "internal.h" 18#include "internal.h"
17 19
18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 20#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
31 * This should be safe, as we require bdi backing to actually 33 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place 34 * write out data in the first place
33 */ 35 */
34 if (!sb->s_bdi) 36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
35 return 0; 37 return 0;
36 38
37 if (sb->s_qcop && sb->s_qcop->quota_sync) 39 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -40,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
40 if (wait) 42 if (wait)
41 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
42 else 44 else
43 writeback_inodes_sb(sb); 45 writeback_inodes_sb_locked(sb);
44 46
45 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
46 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
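Since set_anon_super() now assigns &noop_backing_dev_info (see the fs/super.c hunk above), a bare NULL test no longer identifies superblocks without real writeback, hence the widened guard. Spelled out as a predicate (sb_supports_writeback() is an illustrative name):

#include <linux/backing-dev.h>
#include <linux/fs.h>

static bool sb_supports_writeback(struct super_block *sb)
{
	/* the noop bdi marks anonymous superblocks with nothing to write back */
	return sb->s_bdi && sb->s_bdi != &noop_backing_dev_info;
}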
@@ -75,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
75} 77}
76EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
77 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
78/* 85/*
79 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
80 * emergency sync) 87 * emergency sync)
81 *
82 * This operation is careful to avoid the livelock which could easily happen
83 * if two or more filesystems are being continuously dirtied. s_need_sync
84 * is used only here. We set it against all filesystems and then clear it as
85 * we sync them. So redirtied filesystems are skipped.
86 *
87 * But if process A is currently running sync_filesystems and then process B
88 * calls sync_filesystems as well, process B will set all the s_need_sync
89 * flags again, which will cause process A to resync everything. Fix that with
90 * a local mutex.
91 */ 88 */
92static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
93{ 90{
94 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
95 static DEFINE_MUTEX(mutex);
96
97 mutex_lock(&mutex); /* Could be down_interruptible */
98 spin_lock(&sb_lock);
99 list_for_each_entry(sb, &super_blocks, s_list)
100 sb->s_need_sync = 1;
101
102restart:
103 list_for_each_entry(sb, &super_blocks, s_list) {
104 if (!sb->s_need_sync)
105 continue;
106 sb->s_need_sync = 0;
107 sb->s_count++;
108 spin_unlock(&sb_lock);
109
110 down_read(&sb->s_umount);
111 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
112 __sync_filesystem(sb, wait);
113 up_read(&sb->s_umount);
114
115 /* restart only when sb is no longer on the list */
116 spin_lock(&sb_lock);
117 if (__put_super_and_need_restart(sb))
118 goto restart;
119 }
120 spin_unlock(&sb_lock);
121 mutex_unlock(&mutex);
122} 92}
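iterate_supers() now hides the pin/unpin and s_umount handling that the deleted loop open-coded, leaving callers with only a callback. A sketch of the pattern, assuming the 2.6.35 signature iterate_supers(void (*f)(struct super_block *, void *), void *arg); the counting callback is hypothetical:

#include <linux/fs.h>

/* hypothetical callback: count superblocks mounted read-write */
static void count_writable_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY))
		(*(int *)arg)++;
}

static int count_writable_supers(void)
{
	int n = 0;

	iterate_supers(count_writable_sb, &n);	/* callback runs with s_umount held */
	return n;
}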
123 93
124/* 94/*
@@ -160,12 +130,10 @@ void emergency_sync(void)
160 130
161/* 131/*
162 * Generic function to fsync a file. 132 * Generic function to fsync a file.
163 *
164 * filp may be NULL if called via the msync of a vma.
165 */ 133 */
166int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
167{ 135{
168 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
169 struct super_block * sb; 137 struct super_block * sb;
170 int ret, err; 138 int ret, err;
171 139
@@ -188,7 +156,6 @@ EXPORT_SYMBOL(file_fsync);
188/** 156/**
189 * vfs_fsync_range - helper to sync a range of data & metadata to disk 157 * vfs_fsync_range - helper to sync a range of data & metadata to disk
190 * @file: file to sync 158 * @file: file to sync
191 * @dentry: dentry of @file
192 * @start: offset in bytes of the beginning of data range to sync 159 * @start: offset in bytes of the beginning of data range to sync
193 * @end: offset in bytes of the end of data range (inclusive) 160 * @end: offset in bytes of the end of data range (inclusive)
194 * @datasync: perform only datasync 161 * @datasync: perform only datasync
@@ -196,32 +163,13 @@ EXPORT_SYMBOL(file_fsync);
196 * Write back data in range @start..@end and metadata for @file to disk. If 163 * Write back data in range @start..@end and metadata for @file to disk. If
197 * @datasync is set only metadata needed to access modified file data is 164 * @datasync is set only metadata needed to access modified file data is
198 * written. 165 * written.
199 *
200 * In case this function is called from nfsd @file may be %NULL and
201 * only @dentry is set. This can only happen when the filesystem
202 * implements the export_operations API.
203 */ 166 */
204int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 167int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
205 loff_t end, int datasync)
206{ 168{
207 const struct file_operations *fop; 169 struct address_space *mapping = file->f_mapping;
208 struct address_space *mapping;
209 int err, ret; 170 int err, ret;
210 171
211 /* 172 if (!file->f_op || !file->f_op->fsync) {
212 * Get mapping and operations from the file in case we have
213 * as file, or get the default values for them in case we
214 * don't have a struct file available. Damn nfsd..
215 */
216 if (file) {
217 mapping = file->f_mapping;
218 fop = file->f_op;
219 } else {
220 mapping = dentry->d_inode->i_mapping;
221 fop = dentry->d_inode->i_fop;
222 }
223
224 if (!fop || !fop->fsync) {
225 ret = -EINVAL; 173 ret = -EINVAL;
226 goto out; 174 goto out;
227 } 175 }
@@ -233,7 +181,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
233 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
234 */ 182 */
235 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
236 err = fop->fsync(file, dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
237 if (!ret) 185 if (!ret)
238 ret = err; 186 ret = err;
239 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
@@ -246,19 +194,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
246/** 194/**
247 * vfs_fsync - perform a fsync or fdatasync on a file 195 * vfs_fsync - perform a fsync or fdatasync on a file
248 * @file: file to sync 196 * @file: file to sync
249 * @dentry: dentry of @file
250 * @datasync: only perform a fdatasync operation 197 * @datasync: only perform a fdatasync operation
251 * 198 *
252 * Write back data and metadata for @file to disk. If @datasync is 199 * Write back data and metadata for @file to disk. If @datasync is
253 * set only metadata needed to access modified file data is written. 200 * set only metadata needed to access modified file data is written.
254 *
255 * In case this function is called from nfsd @file may be %NULL and
256 * only @dentry is set. This can only happen when the filesystem
257 * implements the export_operations API.
258 */ 201 */
259int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 202int vfs_fsync(struct file *file, int datasync)
260{ 203{
261 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 204 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
262} 205}
263EXPORT_SYMBOL(vfs_fsync); 206EXPORT_SYMBOL(vfs_fsync);
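The conversion for callers is mechanical: the dentry argument disappears because the inode is now derived from file->f_mapping->host. A hedged before/after sketch (flush_one() is illustrative):

#include <linux/fs.h>

static int flush_one(struct file *filp)
{
	/* old: return vfs_fsync(filp, filp->f_path.dentry, 0); */
	return vfs_fsync(filp, 0);
}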
264 207
@@ -269,7 +212,7 @@ static int do_fsync(unsigned int fd, int datasync)
269 212
270 file = fget(fd); 213 file = fget(fd);
271 if (file) { 214 if (file) {
272 ret = vfs_fsync(file, file->f_path.dentry, datasync); 215 ret = vfs_fsync(file, datasync);
273 fput(file); 216 fput(file);
274 } 217 }
275 return ret; 218 return ret;
@@ -297,8 +240,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
297{ 240{
298 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 241 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
299 return 0; 242 return 0;
300 return vfs_fsync_range(file, file->f_path.dentry, pos, 243 return vfs_fsync_range(file, pos, pos + count - 1,
301 pos + count - 1,
302 (file->f_flags & __O_SYNC) ? 0 : 1); 244 (file->f_flags & __O_SYNC) ? 0 : 1);
303} 245}
304EXPORT_SYMBOL(generic_write_sync); 246EXPORT_SYMBOL(generic_write_sync);
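generic_write_sync() is the usual tail of a buffered write path: a no-op unless O_DSYNC or a sync inode applies, otherwise a ranged fsync over exactly the just-written bytes. A sketch of the caller-side pattern, modeled on generic_file_aio_write() (example_finish_write() is illustrative):

#include <linux/fs.h>

static ssize_t example_finish_write(struct file *file, loff_t pos,
				    ssize_t written)
{
	if (written > 0) {
		int err = generic_write_sync(file, pos, written);
		if (err < 0)
			written = err;	/* surface the sync failure */
	}
	return written;
}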
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
46}; 46};
47 47
48static int 48static int
49fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 49fill_read(struct file *file, char *buffer, loff_t off, size_t count)
50{ 50{
51 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 51 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
54 int rc; 54 int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(file, kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
@@ -70,8 +70,7 @@ static ssize_t
70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) 70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
71{ 71{
72 struct bin_buffer *bb = file->private_data; 72 struct bin_buffer *bb = file->private_data;
73 struct dentry *dentry = file->f_path.dentry; 73 int size = file->f_path.dentry->d_inode->i_size;
74 int size = dentry->d_inode->i_size;
75 loff_t offs = *off; 74 loff_t offs = *off;
76 int count = min_t(size_t, bytes, PAGE_SIZE); 75 int count = min_t(size_t, bytes, PAGE_SIZE);
77 char *temp; 76 char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
92 91
93 mutex_lock(&bb->mutex); 92 mutex_lock(&bb->mutex);
94 93
95 count = fill_read(dentry, bb->buffer, offs, count); 94 count = fill_read(file, bb->buffer, offs, count);
96 if (count < 0) { 95 if (count < 0) {
97 mutex_unlock(&bb->mutex); 96 mutex_unlock(&bb->mutex);
98 goto out_free; 97 goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
117} 116}
118 117
119static int 118static int
120flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 119flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
121{ 120{
122 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 121 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
123 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 122 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
124 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 123 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
125 int rc; 124 int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
130 129
131 rc = -EIO; 130 rc = -EIO;
132 if (attr->write) 131 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 132 rc = attr->write(file, kobj, attr, buffer, offset, count);
134 133
135 sysfs_put_active(attr_sd); 134 sysfs_put_active(attr_sd);
136 135
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
141 size_t bytes, loff_t *off) 140 size_t bytes, loff_t *off)
142{ 141{
143 struct bin_buffer *bb = file->private_data; 142 struct bin_buffer *bb = file->private_data;
144 struct dentry *dentry = file->f_path.dentry; 143 int size = file->f_path.dentry->d_inode->i_size;
145 int size = dentry->d_inode->i_size;
146 loff_t offs = *off; 144 loff_t offs = *off;
147 int count = min_t(size_t, bytes, PAGE_SIZE); 145 int count = min_t(size_t, bytes, PAGE_SIZE);
148 char *temp; 146 char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
165 163
166 memcpy(bb->buffer, temp, count); 164 memcpy(bb->buffer, temp, count);
167 165
168 count = flush_write(dentry, bb->buffer, offs, count); 166 count = flush_write(file, bb->buffer, offs, count);
169 mutex_unlock(&bb->mutex); 167 mutex_unlock(&bb->mutex);
170 168
171 if (count > 0) 169 if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
363 if (!attr->mmap) 361 if (!attr->mmap)
364 goto out_put; 362 goto out_put;
365 363
366 rc = attr->mmap(kobj, attr, vma); 364 rc = attr->mmap(file, kobj, attr, vma);
367 if (rc) 365 if (rc)
368 goto out_put; 366 goto out_put;
369 367
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
501void sysfs_remove_bin_file(struct kobject *kobj, 499void sysfs_remove_bin_file(struct kobject *kobj,
502 const struct bin_attribute *attr) 500 const struct bin_attribute *attr)
503{ 501{
504 sysfs_hash_and_remove(kobj->sd, attr->attr.name); 502 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
505} 503}
506 504
507EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 505EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
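With struct file threaded through, bin_attribute callbacks gain per-open context. A hedged sketch of an attribute written against the new read signature shown above (the "example" attribute is hypothetical):

#include <linux/sysfs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/mm.h>

static ssize_t example_read(struct file *file, struct kobject *kobj,
			    struct bin_attribute *attr,
			    char *buf, loff_t off, size_t count)
{
	/* struct file is now available, e.g. file->private_data */
	memset(buf, 0, count);
	return count;
}

static struct bin_attribute example_attr = {
	.attr	= { .name = "example", .mode = S_IRUGO },
	.size	= PAGE_SIZE,
	.read	= example_read,
};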
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
380{ 380{
381 struct sysfs_inode_attrs *ps_iattr; 381 struct sysfs_inode_attrs *ps_iattr;
382 382
383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
384 return -EEXIST; 384 return -EEXIST;
385 385
386 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
533 * Pointer to sysfs_dirent if found, NULL if not. 533 * Pointer to sysfs_dirent if found, NULL if not.
534 */ 534 */
535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
536 const void *ns,
536 const unsigned char *name) 537 const unsigned char *name)
537{ 538{
538 struct sysfs_dirent *sd; 539 struct sysfs_dirent *sd;
539 540
540 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) 541 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
542 if (ns && sd->s_ns && (sd->s_ns != ns))
543 continue;
541 if (!strcmp(sd->s_name, name)) 544 if (!strcmp(sd->s_name, name))
542 return sd; 545 return sd;
546 }
543 return NULL; 547 return NULL;
544} 548}
545 549
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
558 * Pointer to sysfs_dirent if found, NULL if not. 562 * Pointer to sysfs_dirent if found, NULL if not.
559 */ 563 */
560struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 564struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
565 const void *ns,
561 const unsigned char *name) 566 const unsigned char *name)
562{ 567{
563 struct sysfs_dirent *sd; 568 struct sysfs_dirent *sd;
564 569
565 mutex_lock(&sysfs_mutex); 570 mutex_lock(&sysfs_mutex);
566 sd = sysfs_find_dirent(parent_sd, name); 571 sd = sysfs_find_dirent(parent_sd, ns, name);
567 sysfs_get(sd); 572 sysfs_get(sd);
568 mutex_unlock(&sysfs_mutex); 573 mutex_unlock(&sysfs_mutex);
569 574
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
572EXPORT_SYMBOL_GPL(sysfs_get_dirent); 577EXPORT_SYMBOL_GPL(sysfs_get_dirent);
573 578
574static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 579static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
575 const char *name, struct sysfs_dirent **p_sd) 580 enum kobj_ns_type type, const void *ns, const char *name,
581 struct sysfs_dirent **p_sd)
576{ 582{
577 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 583 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
578 struct sysfs_addrm_cxt acxt; 584 struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
583 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 589 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
584 if (!sd) 590 if (!sd)
585 return -ENOMEM; 591 return -ENOMEM;
592
593 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
594 sd->s_ns = ns;
586 sd->s_dir.kobj = kobj; 595 sd->s_dir.kobj = kobj;
587 596
588 /* link in */ 597 /* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
601int sysfs_create_subdir(struct kobject *kobj, const char *name, 610int sysfs_create_subdir(struct kobject *kobj, const char *name,
602 struct sysfs_dirent **p_sd) 611 struct sysfs_dirent **p_sd)
603{ 612{
604 return create_dir(kobj, kobj->sd, name, p_sd); 613 return create_dir(kobj, kobj->sd,
614 KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
615}
616
617/**
618 * sysfs_read_ns_type: return associated ns_type
619 * @kobj: the kobject being queried
620 *
621 * Each kobject can be tagged with exactly one namespace type
622 * (i.e. network or user). Return the ns_type associated with
 623 * this object, if any.
624 */
625static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
626{
627 const struct kobj_ns_type_operations *ops;
628 enum kobj_ns_type type;
629
630 ops = kobj_child_ns_ops(kobj);
631 if (!ops)
632 return KOBJ_NS_TYPE_NONE;
633
634 type = ops->type;
635 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
636 BUG_ON(type >= KOBJ_NS_TYPES);
637 BUG_ON(!kobj_ns_type_registered(type));
638
639 return type;
605} 640}
606 641
607/** 642/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
610 */ 645 */
611int sysfs_create_dir(struct kobject * kobj) 646int sysfs_create_dir(struct kobject * kobj)
612{ 647{
648 enum kobj_ns_type type;
613 struct sysfs_dirent *parent_sd, *sd; 649 struct sysfs_dirent *parent_sd, *sd;
650 const void *ns = NULL;
614 int error = 0; 651 int error = 0;
615 652
616 BUG_ON(!kobj); 653 BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
620 else 657 else
621 parent_sd = &sysfs_root; 658 parent_sd = &sysfs_root;
622 659
623 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 660 if (sysfs_ns_type(parent_sd))
661 ns = kobj->ktype->namespace(kobj);
662 type = sysfs_read_ns_type(kobj);
663
664 error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
624 if (!error) 665 if (!error)
625 kobj->sd = sd; 666 kobj->sd = sd;
626 return error; 667 return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
630 struct nameidata *nd) 671 struct nameidata *nd)
631{ 672{
632 struct dentry *ret = NULL; 673 struct dentry *ret = NULL;
633 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; 674 struct dentry *parent = dentry->d_parent;
675 struct sysfs_dirent *parent_sd = parent->d_fsdata;
634 struct sysfs_dirent *sd; 676 struct sysfs_dirent *sd;
635 struct inode *inode; 677 struct inode *inode;
678 enum kobj_ns_type type;
679 const void *ns;
636 680
637 mutex_lock(&sysfs_mutex); 681 mutex_lock(&sysfs_mutex);
638 682
639 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 683 type = sysfs_ns_type(parent_sd);
684 ns = sysfs_info(dir->i_sb)->ns[type];
685
686 sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
640 687
641 /* no such entry */ 688 /* no such entry */
642 if (!sd) { 689 if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
735} 782}
736 783
737int sysfs_rename(struct sysfs_dirent *sd, 784int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name) 785 struct sysfs_dirent *new_parent_sd, const void *new_ns,
786 const char *new_name)
739{ 787{
740 const char *dup_name = NULL; 788 const char *dup_name = NULL;
741 int error; 789 int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
743 mutex_lock(&sysfs_mutex); 791 mutex_lock(&sysfs_mutex);
744 792
745 error = 0; 793 error = 0;
746 if ((sd->s_parent == new_parent_sd) && 794 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
747 (strcmp(sd->s_name, new_name) == 0)) 795 (strcmp(sd->s_name, new_name) == 0))
748 goto out; /* nothing to rename */ 796 goto out; /* nothing to rename */
749 797
750 error = -EEXIST; 798 error = -EEXIST;
751 if (sysfs_find_dirent(new_parent_sd, new_name)) 799 if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
752 goto out; 800 goto out;
753 801
754 /* rename sysfs_dirent */ 802 /* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
770 sd->s_parent = new_parent_sd; 818 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd); 819 sysfs_link_sibling(sd);
772 } 820 }
821 sd->s_ns = new_ns;
773 822
774 error = 0; 823 error = 0;
775 out: 824 out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
780 829
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name) 830int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{ 831{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); 832 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
833 const void *new_ns = NULL;
834
835 if (sysfs_ns_type(parent_sd))
836 new_ns = kobj->ktype->namespace(kobj);
837
838 return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
784} 839}
785 840
786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 841int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
787{ 842{
788 struct sysfs_dirent *sd = kobj->sd; 843 struct sysfs_dirent *sd = kobj->sd;
789 struct sysfs_dirent *new_parent_sd; 844 struct sysfs_dirent *new_parent_sd;
845 const void *new_ns = NULL;
790 846
791 BUG_ON(!sd->s_parent); 847 BUG_ON(!sd->s_parent);
848 if (sysfs_ns_type(sd->s_parent))
849 new_ns = kobj->ktype->namespace(kobj);
792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 850 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
793 new_parent_kobj->sd : &sysfs_root; 851 new_parent_kobj->sd : &sysfs_root;
794 852
795 return sysfs_rename(sd, new_parent_sd, sd->s_name); 853 return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
796} 854}
797 855
798/* Relationship between s_mode and the DT_xxx types */ 856/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
807 return 0; 865 return 0;
808} 866}
809 867
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, 868static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
811 ino_t ino, struct sysfs_dirent *pos) 869 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
812{ 870{
813 if (pos) { 871 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && 872 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd && 873 pos->s_parent == parent_sd &&
816 ino == pos->s_ino; 874 ino == pos->s_ino;
817 sysfs_put(pos); 875 sysfs_put(pos);
818 if (valid) 876 if (!valid)
819 return pos; 877 pos = NULL;
820 } 878 }
821 pos = NULL; 879 if (!pos && (ino > 1) && (ino < INT_MAX)) {
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children; 880 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino)) 881 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling; 882 pos = pos->s_sibling;
826 } 883 }
884 while (pos && pos->s_ns && pos->s_ns != ns)
885 pos = pos->s_sibling;
827 return pos; 886 return pos;
828} 887}
829 888
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, 889static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
831 ino_t ino, struct sysfs_dirent *pos) 890 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
832{ 891{
833 pos = sysfs_dir_pos(parent_sd, ino, pos); 892 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
834 if (pos) 893 if (pos)
835 pos = pos->s_sibling; 894 pos = pos->s_sibling;
895 while (pos && pos->s_ns && pos->s_ns != ns)
896 pos = pos->s_sibling;
836 return pos; 897 return pos;
837} 898}
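Both position helpers end with the same skip loop, so readdir never surfaces entries tagged for a foreign namespace. Isolated, the loop is (skip_foreign_ns() is an illustrative name):

static struct sysfs_dirent *skip_foreign_ns(struct sysfs_dirent *pos,
					    const void *ns)
{
	while (pos && pos->s_ns && pos->s_ns != ns)
		pos = pos->s_sibling;
	return pos;
}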
838 899
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841 struct dentry *dentry = filp->f_path.dentry; 902 struct dentry *dentry = filp->f_path.dentry;
842 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 903 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
843 struct sysfs_dirent *pos = filp->private_data; 904 struct sysfs_dirent *pos = filp->private_data;
905 enum kobj_ns_type type;
906 const void *ns;
844 ino_t ino; 907 ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 602 mutex_lock(&sysfs_mutex);
600 603
601 rc = -ENOENT; 604 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 605 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 606 if (!sd)
604 goto out; 607 goto out;
605 608
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 627
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 628void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 629{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 630 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 631}
629 632
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 633void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 649 struct sysfs_dirent *dir_sd;
647 650
648 if (group) 651 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 652 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 653 else
651 dir_sd = sysfs_get(kobj->sd); 654 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 655 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 656 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 657 sysfs_put(dir_sd);
655 } 658 }
656} 659}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 082daaecac1b..bde1a4c3679a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -116,13 +117,11 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
116 if (error) 117 if (error)
117 goto out; 118 goto out;
118 119
119 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 /* this ignores size changes */
120 121 generic_setattr(inode, iattr);
121 error = inode_setattr(inode, iattr);
122 if (error)
123 goto out;
124 122
125 error = sysfs_sd_setattr(sd, iattr); 123 error = sysfs_sd_setattr(sd, iattr);
124
126out: 125out:
127 mutex_unlock(&sysfs_mutex); 126 mutex_unlock(&sysfs_mutex);
128 return error; 127 return error;
@@ -323,7 +322,7 @@ void sysfs_delete_inode(struct inode *inode)
323 sysfs_put(sd); 322 sysfs_put(sd);
324} 323}
325 324
326int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 325int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
327{ 326{
328 struct sysfs_addrm_cxt acxt; 327 struct sysfs_addrm_cxt acxt;
329 struct sysfs_dirent *sd; 328 struct sysfs_dirent *sd;
@@ -333,7 +332,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
333 332
334 sysfs_addrm_start(&acxt, dir_sd); 333 sysfs_addrm_start(&acxt, dir_sd);
335 334
336 sd = sysfs_find_dirent(dir_sd, name); 335 sd = sysfs_find_dirent(dir_sd, ns, name);
336 if (sd && (sd->s_ns != ns))
337 sd = NULL;
337 if (sd) 338 if (sd)
338 sysfs_remove_one(&acxt, sd); 339 sysfs_remove_one(&acxt, sd);
339 340
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 0cb10884a2fc..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
@@ -34,7 +35,7 @@ static const struct super_operations sysfs_ops = {
34struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
35 .s_name = "", 36 .s_name = "",
36 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
37 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
38 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
39 .s_ino = 1, 40 .s_ino = 1,
40}; 41};
@@ -71,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
71 return 0; 72 return 0;
72} 73}
73 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
74static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
75 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
76{ 100{
77 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
78} 146}
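The switch from get_sb_single() to sget() is what allows one superblock per namespace-tag set: sget() reuses an existing sb only when the test callback matches, and initializes a fresh one through the set callback. A hedged sketch of that contract (get_or_create_sb() is illustrative; error handling trimmed):

#include <linux/fs.h>
#include <linux/err.h>

static struct super_block *get_or_create_sb(struct file_system_type *fs,
					    void *tag_set)
{
	struct super_block *sb;

	/* test(sb, data) != 0 -> reuse sb; set(sb, data) fills a new one */
	sb = sget(fs, sysfs_test_super, sysfs_set_super, tag_set);
	if (IS_ERR(sb))
		return sb;
	/* sb->s_root == NULL means sget() handed back a fresh, unfilled sb */
	return sb;
}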
79 147
80static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
81 .name = "sysfs", 149 .name = "sysfs",
82 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
83 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
84}; 152};
85 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
164 * list the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
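A teardown path for a tagged namespace would invalidate its sysfs tag through this hook; for the network case that is roughly the following (example_net_exit() is illustrative, and KOBJ_NS_TYPE_NET is the 2.6.35 enum value for network namespaces):

#include <linux/kobject.h>

static void example_net_exit(void *net_ns)
{
	/* NULL out the stale tag in every mounted sysfs superblock */
	sysfs_exit_ns(KOBJ_NS_TYPE_NET, net_ns);
}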
175
86int __init sysfs_init(void) 176int __init sysfs_init(void)
87{ 177{
88 int err = -ENOMEM; 178 int err = -ENOMEM;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1b9a3a1e8a17..f71246bebfe4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -57,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
57 if (!sd) 58 if (!sd)
58 goto out_put; 59 goto out_put;
59 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
60 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
61 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
62 65
@@ -106,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
106} 109}
107 110
108/** 111/**
112 * sysfs_delete_link - remove symlink in object's directory.
113 * @kobj: object we're acting for.
114 * @targ: object we're pointing to.
115 * @name: name of the symlink to remove.
116 *
 117 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
118 * to successfully delete symlinks in tagged directories.
119 */
120void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
121 const char *name)
122{
123 const void *ns = NULL;
124 spin_lock(&sysfs_assoc_lock);
125 if (targ->sd)
126 ns = targ->sd->s_ns;
127 spin_unlock(&sysfs_assoc_lock);
128 sysfs_hash_and_remove(kobj->sd, ns, name);
129}
130
131/**
109 * sysfs_remove_link - remove symlink in object's directory. 132 * sysfs_remove_link - remove symlink in object's directory.
110 * @kobj: object we're acting for. 133 * @kobj: object we're acting for.
111 * @name: name of the symlink to remove. 134 * @name: name of the symlink to remove.
@@ -120,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
120 else 143 else
121 parent_sd = kobj->sd; 144 parent_sd = kobj->sd;
122 145
123 sysfs_hash_and_remove(parent_sd, name); 146 sysfs_hash_and_remove(parent_sd, NULL, name);
124} 147}
125 148
126/** 149/**
@@ -136,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
136 const char *old, const char *new) 159 const char *old, const char *new)
137{ 160{
138 struct sysfs_dirent *parent_sd, *sd = NULL; 161 struct sysfs_dirent *parent_sd, *sd = NULL;
162 const void *old_ns = NULL, *new_ns = NULL;
139 int result; 163 int result;
140 164
141 if (!kobj) 165 if (!kobj)
@@ -143,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
143 else 167 else
144 parent_sd = kobj->sd; 168 parent_sd = kobj->sd;
145 169
170 if (targ->sd)
171 old_ns = targ->sd->s_ns;
172
146 result = -ENOENT; 173 result = -ENOENT;
147 sd = sysfs_get_dirent(parent_sd, old); 174 sd = sysfs_get_dirent(parent_sd, old_ns, old);
148 if (!sd) 175 if (!sd)
149 goto out; 176 goto out;
150 177
@@ -154,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
154 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 181 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
155 goto out; 182 goto out;
156 183
157 result = sysfs_rename(sd, parent_sd, new); 184 if (sysfs_ns_type(parent_sd))
185 new_ns = targ->ktype->namespace(targ);
186
187 result = sysfs_rename(sd, parent_sd, new_ns, new);
158 188
159out: 189out:
160 sysfs_put(sd); 190 sysfs_put(sd);
@@ -260,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
260 290
261EXPORT_SYMBOL_GPL(sysfs_create_link); 291EXPORT_SYMBOL_GPL(sysfs_create_link);
262EXPORT_SYMBOL_GPL(sysfs_remove_link); 292EXPORT_SYMBOL_GPL(sysfs_remove_link);
293EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
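The namespace type rides in bits 8-15 of s_flags, next to the dirent type bits. Packing and unpacking with the masks defined above looks like this (helper names are illustrative):

static unsigned int pack_ns_type(unsigned int flags, enum kobj_ns_type type)
{
	return (flags & ~SYSFS_NS_TYPE_MASK) | (type << SYSFS_NS_TYPE_SHIFT);
}

static enum kobj_ns_type unpack_ns_type(unsigned int flags)
{
	return (flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
}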
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
133 * Each sb is associated with a set of namespace tags (i.e.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
164 name, de->name)) 164 name, de->name))
165 goto found; 165 goto found;
166 } 166 }
167 dir_put_page(page);
167 } 168 }
168 dir_put_page(page);
169 169
170 if (++n >= npages) 170 if (++n >= npages)
171 n = 0; 171 n = 0;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
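For comparison, this is roughly the open-coded owner setup that inode_init_owner() replaces here and in the ubifs hunk below: uid from the caller, gid inherited through setgid directories (init_owner_open_coded() is an illustrative reconstruction of the deleted lines):

#include <linux/fs.h>
#include <linux/cred.h>

static void init_owner_open_coded(struct inode *inode,
				  const struct inode *dir, mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* subdirs inherit setgid */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}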
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -109,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
109 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
110 ssize_t res; 111 ssize_t res;
111 u64 ticks = 0; 112 u64 ticks = 0;
112 DECLARE_WAITQUEUE(wait, current);
113 113
114 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
115 return -EINVAL; 115 return -EINVAL;
116 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
117 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
118 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
119 __add_wait_queue(&ctx->wqh, &wait); 119 else
120 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (ctx->ticks) {
123 res = 0;
124 break;
125 }
126 if (signal_pending(current)) {
127 res = -ERESTARTSYS;
128 break;
129 }
130 spin_unlock_irq(&ctx->wqh.lock);
131 schedule();
132 spin_lock_irq(&ctx->wqh.lock);
133 }
134 __remove_wait_queue(&ctx->wqh, &wait);
135 __set_current_state(TASK_RUNNING);
136 }
137 if (ctx->ticks) { 121 if (ctx->ticks) {
138 ticks = ctx->ticks; 122 ticks = ctx->ticks;
139 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
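wait_event_interruptible_locked_irq() folds the deleted DECLARE_WAITQUEUE/schedule() dance into one call: it expects the wait-queue lock held on entry, drops it across schedule(), and re-takes it before returning (0 once the condition holds, -ERESTARTSYS on a signal). A hedged sketch of the calling convention, assuming the file-local struct timerfd_ctx above (wait_for_ticks() is illustrative):

#include <linux/wait.h>
#include <linux/spinlock.h>

static ssize_t wait_for_ticks(struct timerfd_ctx *ctx)
{
	ssize_t res;

	spin_lock_irq(&ctx->wqh.lock);
	res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
	/* lock is held again here; ctx->ticks may be read safely */
	spin_unlock_irq(&ctx->wqh.lock);
	return res;
}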
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
@@ -966,12 +967,15 @@ static int do_writepage(struct page *page, int len)
966 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
967 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
968 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
969 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
970 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
971 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
972 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
973 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
974 * 975 *
976 * XXX: with the new truncate the above is not true anymore; the simple_setsize
977 * calls can be replaced with their individual components.
978 *
975 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
976 * inode size. How do we do this if @inode->i_size may become smaller while we 980 * inode size. How do we do this if @inode->i_size may become smaller while we
977 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1124,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1124 budgeted = 0; 1128 budgeted = 0;
1125 } 1129 }
1126 1130
1127 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1128 if (err) 1132 if (err)
1129 goto out_budg; 1133 goto out_budg;
1130 1134
@@ -1213,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1213 1217
1214 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1215 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1216 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1217 if (err) 1221 if (err)
1218 goto out; 1222 goto out;
1219 } 1223 }
@@ -1222,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1222 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1223 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1224 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1225 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1226 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1227 } 1231 }
1228 1232
@@ -1303,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1303 return NULL; 1307 return NULL;
1304} 1308}
1305 1309
1306int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1307{ 1311{
1308 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1309 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1310 int err; 1314 int err;
1311 1315
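Two API changes meet in fs/ubifs/file.c: vmtruncate() is replaced by
simple_setsize(), and ->fsync loses its dentry argument, taking the inode
from file->f_mapping->host instead. A rough reconstruction of what
simple_setsize() does at this point in time (assumed from its fs/libfs.c
definition, simplified):

	int simple_setsize(struct inode *inode, loff_t newsize)
	{
		loff_t oldsize;
		int error;

		error = inode_newsize_ok(inode, newsize);  /* limits check */
		if (error)
			return error;

		oldsize = inode->i_size;
		i_size_write(inode, newsize);	/* publish the new size */
		truncate_pagecache(inode, oldsize, newsize); /* drop pages */
		return 0;
	}

The ordering (size first, pages second) is exactly what the surrounding
UBIFS comment worries about with respect to @ui_mutex.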
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
57#include <linux/list_sort.h> 58#include <linux/list_sort.h>
58#include "ubifs.h" 59#include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
@@ -63,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
63 if (!c->ro_media) { 64 if (!c->ro_media) {
64 c->ro_media = 1; 65 c->ro_media = 1;
65 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
66 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
67 dbg_dump_stack(); 69 dbg_dump_stack();
68 } 70 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
@@ -378,7 +379,7 @@ struct ubifs_gced_idx_leb {
378 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
379 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
380 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
381 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
382 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
383 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
384 * could consider reworking locking and basing it on "shadow" fields. 385 * could consider reworking locking and basing it on "shadow" fields.
@@ -1677,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1677int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1678 1679
1679/* file.c */ 1680/* file.c */
1680int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1681int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1682 1683
1683/* dir.c */ 1684/* dir.c */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 19626e2491c4..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -125,9 +124,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
125 124
126 mutex_lock(&sbi->s_alloc_mutex); 125 mutex_lock(&sbi->s_alloc_mutex);
127 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 126 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
128 if (bloc->logicalBlockNum < 0 || 127 if (bloc->logicalBlockNum + count < count ||
129 (bloc->logicalBlockNum + count) > 128 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
130 partmap->s_partition_len) {
131 udf_debug("%d < %d || %d + %d > %d\n", 129 udf_debug("%d < %d || %d + %d > %d\n",
132 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 130 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
133 count, partmap->s_partition_len); 131 count, partmap->s_partition_len);
@@ -160,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
160 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
161 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
162 } else { 160 } else {
163 if (inode)
164 dquot_free_block(inode, 1);
165 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
166 } 162 }
167 } 163 }
@@ -211,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
211 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
212 208
213 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
214 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
215 goto out; 211 goto out;
216 else if (dquot_prealloc_block(inode, 1))
217 goto out;
218 else if (!udf_clear_bit(bit, bh->b_data)) {
219 udf_debug("bit already cleared for block %d\n", bit);
220 dquot_free_block(inode, 1);
221 goto out;
222 }
223 block_count--; 212 block_count--;
224 alloc_count++; 213 alloc_count++;
225 bit++; 214 bit++;
@@ -339,20 +328,6 @@ search_back:
339 } 328 }
340 329
341got_block: 330got_block:
342
343 /*
344 * Check quota for allocation of this block.
345 */
346 if (inode) {
347 int ret = dquot_alloc_block(inode, 1);
348
349 if (ret) {
350 mutex_unlock(&sbi->s_alloc_mutex);
351 *err = ret;
352 return 0;
353 }
354 }
355
356 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
357 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
358 333
@@ -393,9 +368,8 @@ static void udf_table_free_blocks(struct super_block *sb,
393 368
394 mutex_lock(&sbi->s_alloc_mutex); 369 mutex_lock(&sbi->s_alloc_mutex);
395 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 370 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
396 if (bloc->logicalBlockNum < 0 || 371 if (bloc->logicalBlockNum + count < count ||
397 (bloc->logicalBlockNum + count) > 372 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
398 partmap->s_partition_len) {
399 udf_debug("%d < %d || %d + %d > %d\n", 373 udf_debug("%d < %d || %d + %d > %d\n",
400 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 374 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
401 partmap->s_partition_len); 375 partmap->s_partition_len);
@@ -403,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
403 } 377 }
404 378
405 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
406 /* We do this up front - There are some error conditions that
407 could occure, but.. oh well */
408 if (inode)
409 dquot_free_block(inode, count);
410 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
411 381
412 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -651,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
651 epos.offset -= adsize; 621 epos.offset -= adsize;
652 622
653 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
654 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
655 alloc_count > block_count ? block_count : alloc_count))
656 alloc_count = 0;
657 else if (alloc_count > block_count) {
658 alloc_count = block_count; 625 alloc_count = block_count;
659 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
660 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -754,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
754 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
755 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
756 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
757 if (inode) {
758 *err = dquot_alloc_block(inode, 1);
759 if (*err) {
760 brelse(goal_epos.bh);
761 mutex_unlock(&sbi->s_alloc_mutex);
762 return 0;
763 }
764 }
765 724
766 if (goal_elen) 725 if (goal_elen)
767 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
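Both rewritten bounds checks in fs/udf/balloc.c fix the same bug:
logicalBlockNum is unsigned, so the old `logicalBlockNum < 0` test could
never fire, while `logicalBlockNum + count` could silently wrap around. For
unsigned arithmetic, a + b wraps exactly when the sum is smaller than either
operand, which is what the new test exploits. A hypothetical helper stating
the idiom:

	/* Illustrative only: reject a block range that wraps or runs
	 * past the partition length. */
	static inline int udf_range_invalid(uint32_t start, uint32_t count,
					    uint32_t limit)
	{
		return start + count < count ||	/* unsigned wraparound */
		       start + count > limit;
	}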
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,9 +34,9 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
39#include <linux/smp_lock.h>
40 40
41#include "udf_i.h" 41#include "udf_i.h"
42#include "udf_sb.h" 42#include "udf_sb.h"
@@ -144,50 +144,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 144 return retval;
145} 145}
146 146
147int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 147long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
148 unsigned long arg)
149{ 148{
149 struct inode *inode = filp->f_dentry->d_inode;
150 long old_block, new_block; 150 long old_block, new_block;
151 int result = -EINVAL; 151 int result = -EINVAL;
152 152
153 lock_kernel();
154
153 if (file_permission(filp, MAY_READ) != 0) { 155 if (file_permission(filp, MAY_READ) != 0) {
154 udf_debug("no permission to access inode %lu\n", 156 udf_debug("no permission to access inode %lu\n", inode->i_ino);
155 inode->i_ino); 157 result = -EPERM;
156 return -EPERM; 158 goto out;
157 } 159 }
158 160
159 if (!arg) { 161 if (!arg) {
160 udf_debug("invalid argument to udf_ioctl\n"); 162 udf_debug("invalid argument to udf_ioctl\n");
161 return -EINVAL; 163 result = -EINVAL;
164 goto out;
162 } 165 }
163 166
164 switch (cmd) { 167 switch (cmd) {
165 case UDF_GETVOLIDENT: 168 case UDF_GETVOLIDENT:
166 if (copy_to_user((char __user *)arg, 169 if (copy_to_user((char __user *)arg,
167 UDF_SB(inode->i_sb)->s_volume_ident, 32)) 170 UDF_SB(inode->i_sb)->s_volume_ident, 32))
168 return -EFAULT; 171 result = -EFAULT;
169 else 172 else
170 return 0; 173 result = 0;
174 goto out;
171 case UDF_RELOCATE_BLOCKS: 175 case UDF_RELOCATE_BLOCKS:
172 if (!capable(CAP_SYS_ADMIN)) 176 if (!capable(CAP_SYS_ADMIN)) {
173 return -EACCES; 177 result = -EACCES;
174 if (get_user(old_block, (long __user *)arg)) 178 goto out;
175 return -EFAULT; 179 }
180 if (get_user(old_block, (long __user *)arg)) {
181 result = -EFAULT;
182 goto out;
183 }
176 result = udf_relocate_blocks(inode->i_sb, 184 result = udf_relocate_blocks(inode->i_sb,
177 old_block, &new_block); 185 old_block, &new_block);
178 if (result == 0) 186 if (result == 0)
179 result = put_user(new_block, (long __user *)arg); 187 result = put_user(new_block, (long __user *)arg);
180 return result; 188 goto out;
181 case UDF_GETEASIZE: 189 case UDF_GETEASIZE:
182 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); 190 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
183 break; 191 goto out;
184 case UDF_GETEABLOCK: 192 case UDF_GETEABLOCK:
185 result = copy_to_user((char __user *)arg, 193 result = copy_to_user((char __user *)arg,
186 UDF_I(inode)->i_ext.i_data, 194 UDF_I(inode)->i_ext.i_data,
187 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; 195 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
188 break; 196 goto out;
189 } 197 }
190 198
199out:
200 unlock_kernel();
191 return result; 201 return result;
192} 202}
193 203
@@ -207,40 +217,17 @@ static int udf_release_file(struct inode *inode, struct file *filp)
207const struct file_operations udf_file_operations = { 217const struct file_operations udf_file_operations = {
208 .read = do_sync_read, 218 .read = do_sync_read,
209 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
210 .ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
211 .open = dquot_file_open, 221 .open = generic_file_open,
212 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
213 .write = do_sync_write, 223 .write = do_sync_write,
214 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
215 .release = udf_release_file, 225 .release = udf_release_file,
216 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
217 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
218 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
219}; 229};
220 230
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
243const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
244 .truncate = udf_truncate, 232 .truncate = udf_truncate,
245 .setattr = udf_setattr,
246}; 233};
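udf_ioctl() moves from the legacy ->ioctl hook (called with the BKL held and
the inode passed in by the VFS) to ->unlocked_ioctl, which receives only the
file and must handle its own locking; the patch pushes lock_kernel()/
unlock_kernel() into the handler and routes every early return through one
unlock path. The general shape of such a BKL-pushdown conversion, with
hypothetical names:

	long foo_unlocked_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
	{
		/* the old ->ioctl hook received the inode directly */
		struct inode *inode = filp->f_dentry->d_inode;
		long ret;

		lock_kernel();	/* keep the serialization ->ioctl provided */
		ret = foo_do_ioctl(inode, filp, cmd, arg);
		unlock_kernel();
		return ret;
	}

Once every handler is converted this way, the BKL can later be narrowed or
dropped per filesystem.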
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -124,15 +116,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 116 udf_updated_lvid(sb);
125 } 117 }
126 mutex_unlock(&sbi->s_alloc_mutex); 118 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 119
128 inode->i_uid = current_fsuid(); 120 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 121
137 iinfo->i_location.logicalBlockNum = block; 122 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 123 iinfo->i_location.partitionReferenceNum =
@@ -153,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 138 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
155 140
156 dquot_initialize(inode);
157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
160 inode->i_flags |= S_NOQUOTA;
161 inode->i_nlink = 0;
162 iput(inode);
163 *err = ret;
164 return NULL;
165 }
166
167 *err = 0; 141 *err = 0;
168 return inode; 142 return inode;
169} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bb863fe579ac..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
@@ -1314,7 +1309,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1314 break; 1309 break;
1315 case ICBTAG_FILE_TYPE_SYMLINK: 1310 case ICBTAG_FILE_TYPE_SYMLINK:
1316 inode->i_data.a_ops = &udf_symlink_aops; 1311 inode->i_data.a_ops = &udf_symlink_aops;
1317 inode->i_op = &page_symlink_inode_operations; 1312 inode->i_op = &udf_symlink_inode_operations;
1318 inode->i_mode = S_IFLNK | S_IRWXUGO; 1313 inode->i_mode = S_IFLNK | S_IRWXUGO;
1319 break; 1314 break;
1320 case ICBTAG_FILE_TYPE_MAIN: 1315 case ICBTAG_FILE_TYPE_MAIN:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -579,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 576 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 577 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 578 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
584 580
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 581 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -618,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
618 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
619 return -EINVAL; 615 return -EINVAL;
620 616
621 dquot_initialize(dir);
622
623 lock_kernel(); 617 lock_kernel();
624 err = -EIO; 618 err = -EIO;
625 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -627,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 621 goto out;
628 622
629 iinfo = UDF_I(inode); 623 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 624 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 625 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 626 if (!fi) {
@@ -666,15 +659,13 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
666 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
667 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
668 661
669 dquot_initialize(dir);
670
671 lock_kernel(); 662 lock_kernel();
672 err = -EMLINK; 663 err = -EMLINK;
673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
674 goto out; 665 goto out;
675 666
676 err = -EIO; 667 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 668 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 669 if (!inode)
679 goto out; 670 goto out;
680 671
@@ -697,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 688 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 689 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 690 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 691 mark_inode_dirty(inode);
704 692
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 693 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -805,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
805 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
806 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
807 795
808 dquot_initialize(dir);
809
810 retval = -ENOENT; 796 retval = -ENOENT;
811 lock_kernel(); 797 lock_kernel();
812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -853,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
853 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
854 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
855 841
856 dquot_initialize(dir);
857
858 retval = -ENOENT; 842 retval = -ENOENT;
859 lock_kernel(); 843 lock_kernel();
860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -909,10 +893,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
909 struct buffer_head *bh; 893 struct buffer_head *bh;
910 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
911 895
912 dquot_initialize(dir);
913
914 lock_kernel(); 896 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 898 if (!inode)
917 goto out; 899 goto out;
918 900
@@ -923,9 +905,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 905 }
924 906
925 iinfo = UDF_I(inode); 907 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 908 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &page_symlink_inode_operations; 909 inode->i_op = &udf_symlink_inode_operations;
929 910
930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 911 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
931 struct kernel_lb_addr eloc; 912 struct kernel_lb_addr eloc;
@@ -1081,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1081 int err; 1062 int err;
1082 struct buffer_head *bh; 1063 struct buffer_head *bh;
1083 1064
1084 dquot_initialize(dir);
1085
1086 lock_kernel(); 1065 lock_kernel();
1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1088 unlock_kernel(); 1067 unlock_kernel();
@@ -1145,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1145 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1147 1126
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1151 lock_kernel(); 1127 lock_kernel();
1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1153 if (ofi) { 1129 if (ofi) {
@@ -1401,3 +1377,8 @@ const struct inode_operations udf_dir_inode_operations = {
1401 .mknod = udf_mknod, 1377 .mknod = udf_mknod,
1402 .rename = udf_rename, 1378 .rename = udf_rename,
1403}; 1379};
1380const struct inode_operations udf_symlink_inode_operations = {
1381 .readlink = generic_readlink,
1382 .follow_link = page_follow_link_light,
1383 .put_link = page_put_link,
1384};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
76extern const struct file_operations udf_dir_operations; 76extern const struct file_operations udf_dir_operations;
77extern const struct inode_operations udf_file_inode_operations; 77extern const struct inode_operations udf_file_inode_operations;
78extern const struct file_operations udf_file_operations; 78extern const struct file_operations udf_file_operations;
79extern const struct inode_operations udf_symlink_inode_operations;
79extern const struct address_space_operations udf_aops; 80extern const struct address_space_operations udf_aops;
80extern const struct address_space_operations udf_adinicb_aops; 81extern const struct address_space_operations udf_adinicb_aops;
81extern const struct address_space_operations udf_symlink_aops; 82extern const struct address_space_operations udf_symlink_aops;
@@ -129,9 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
129 uint8_t *, uint8_t *); 130 uint8_t *, uint8_t *);
130 131
131/* file.c */ 132/* file.c */
132extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
133 unsigned long);
134
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -303,15 +299,7 @@ cg_found:
303 sb->s_dirt = 1; 299 sb->s_dirt = 1;
304 300
305 inode->i_ino = cg * uspi->s_ipg + bit; 301 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 302 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 303 inode->i_blocks = 0;
316 inode->i_generation = 0; 304 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 305 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,21 +343,12 @@ cg_found:
355 343
356 unlock_super (sb); 344 unlock_super (sb);
357 345
358 dquot_initialize(inode);
359 err = dquot_alloc_inode(inode);
360 if (err) {
361 dquot_drop(inode);
362 goto fail_without_unlock;
363 }
364
365 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
366 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
367 return inode; 348 return inode;
368 349
369fail_remove_inode: 350fail_remove_inode:
370 unlock_super(sb); 351 unlock_super(sb);
371fail_without_unlock:
372 inode->i_flags |= S_NOQUOTA;
373 inode->i_nlink = 0; 352 inode->i_nlink = 0;
374 iput(inode); 353 iput(inode);
375 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -603,7 +602,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 602 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 603 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 604 else {
606 inode->i_op = &page_symlink_inode_operations; 605 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 606 inode->i_mapping->a_ops = &ufs_aops;
608 } 607 }
609 } else 608 } else
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -148,7 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 141
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 142 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 143 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 144 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 145 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 146 err = page_symlink(inode, symname, l);
154 if (err) 147 if (err)
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and noone else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,14 +500,12 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way to inform the user about an error, if it happens in `truncate'
510 */ 507 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
513 struct inode *inode = dentry->d_inode; 510 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 511 unsigned int ia_valid = attr->ia_valid;
@@ -518,19 +515,10 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
527 if (ia_valid & ATTR_SIZE &&
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
530 520
531 dquot_initialize(inode); 521 error = simple_setsize(inode, attr->ia_size);
532
533 error = vmtruncate(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
@@ -439,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
439 return error; 440 return error;
440} 441}
441 442
442struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
443 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
444 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
445 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
446 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
447}; 448};
448 449
449struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
450 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
451 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
452 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 99628508cb11..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,10 +40,20 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
46 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
47 57
48/* 58/*
49 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -102,8 +112,9 @@ xfs_count_page_state(
102 112
103STATIC struct block_device * 113STATIC struct block_device *
104xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
105 struct xfs_inode *ip) 115 struct inode *inode)
106{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
108 119
109 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -182,7 +193,7 @@ xfs_setfilesize(
182 xfs_fsize_t isize; 193 xfs_fsize_t isize;
183 194
184 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
185 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
186 197
187 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
188 return 0; 199 return 0;
@@ -213,7 +224,7 @@ xfs_finish_ioend(
213 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
215 226
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
219 if (wait) 230 if (wait)
@@ -236,7 +247,7 @@ xfs_end_io(
236 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
237 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
238 */ 249 */
239 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
241 252
242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -249,7 +260,7 @@ xfs_end_io(
249 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
250 * writes. 261 * writes.
251 */ 262 */
252 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
253 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
254 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
255 } 266 }
@@ -308,21 +319,25 @@ xfs_map_blocks(
308 struct inode *inode, 319 struct inode *inode,
309 loff_t offset, 320 loff_t offset,
310 ssize_t count, 321 ssize_t count,
311 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
312 int flags) 323 int flags)
313{ 324{
314 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
315 327
316 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
317} 329}
318 330
319STATIC int 331STATIC int
320xfs_iomap_valid( 332xfs_imap_valid(
321 xfs_iomap_t *iomapp, 333 struct inode *inode,
322 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
323{ 336{
324 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
325 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
326} 341}
327 342
328/* 343/*
@@ -553,19 +568,23 @@ xfs_add_to_ioend(
553 568
554STATIC void 569STATIC void
555xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
556 struct buffer_head *bh, 572 struct buffer_head *bh,
557 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
558 xfs_off_t offset, 574 xfs_off_t offset)
559 uint block_bits)
560{ 575{
561 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
562 580
563 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
564 583
565 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
566 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
567 586
568 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
569 588
570 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
571 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
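The new xfs_map_buffer() derives b_blocknr without the old iomap fields: iomap_bn is a disk address in 512-byte basic blocks (BBSHIFT is 9 in XFS), so shifting right by (i_blkbits - BBSHIFT) converts it to inode-block units, and the byte distance into the extent is shifted down by i_blkbits. A sketch of just that arithmetic (standalone C, values illustrative):

#include <stdio.h>

#define BBSHIFT 9	/* 512-byte basic blocks, as in XFS */

/*
 * Compute a buffer_head-style block number (in units of the inode
 * block size) from a 512-byte disk address plus a byte offset into
 * the mapped extent.  Mirrors the bn calculation in xfs_map_buffer().
 */
static long long map_blocknr(long long daddr_512,
			     long long extent_start_bytes,
			     long long byte_off, unsigned blkbits)
{
	return (daddr_512 >> (blkbits - BBSHIFT)) +
	       ((byte_off - extent_start_bytes) >> blkbits);
}

int main(void)
{
	/* 4k blocks: extent at daddr 8000, two blocks into the extent */
	printf("%lld\n", map_blocknr(8000, 40960, 49152, 12));	/* 1002 */
	return 0;
}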
@@ -573,17 +592,17 @@ xfs_map_buffer(
573 592
574STATIC void 593STATIC void
575xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
576 struct buffer_head *bh, 596 struct buffer_head *bh,
577 loff_t offset, 597 struct xfs_bmbt_irec *imap,
578 int block_bits, 598 xfs_off_t offset)
579 xfs_iomap_t *iomapp)
580{ 599{
581 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
583 602
584 lock_buffer(bh); 603 lock_buffer(bh);
585 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
586 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
587 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
588 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
589 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -712,11 +731,11 @@ xfs_is_delayed_page(
712 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
713 do { 732 do {
714 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
715 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
716 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
717 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
718 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
719 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
720 else 739 else
721 break; 740 break;
722 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -739,7 +758,7 @@ xfs_convert_page(
739 struct inode *inode, 758 struct inode *inode,
740 struct page *page, 759 struct page *page,
741 loff_t tindex, 760 loff_t tindex,
742 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
743 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
744 struct writeback_control *wbc, 763 struct writeback_control *wbc,
745 int startio, 764 int startio,
@@ -749,7 +768,6 @@ xfs_convert_page(
749 xfs_off_t end_offset; 768 xfs_off_t end_offset;
750 unsigned long p_offset; 769 unsigned long p_offset;
751 unsigned int type; 770 unsigned int type;
752 int bbits = inode->i_blkbits;
753 int len, page_dirty; 771 int len, page_dirty;
754 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
755 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -801,19 +819,19 @@ xfs_convert_page(
801 819
802 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
803 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
804 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
805 else 823 else
806 type = IOMAP_DELAY; 824 type = IO_DELAY;
807 825
808 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
809 done = 1; 827 done = 1;
810 continue; 828 continue;
811 } 829 }
812 830
813 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
814 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
815 833
816 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
817 if (startio) { 835 if (startio) {
818 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
819 type, ioendp, done); 837 type, ioendp, done);
@@ -825,7 +843,7 @@ xfs_convert_page(
825 page_dirty--; 843 page_dirty--;
826 count++; 844 count++;
827 } else { 845 } else {
828 type = IOMAP_NEW; 846 type = IO_NEW;
829 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
830 lock_buffer(bh); 848 lock_buffer(bh);
831 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -865,7 +883,7 @@ STATIC void
865xfs_cluster_write( 883xfs_cluster_write(
866 struct inode *inode, 884 struct inode *inode,
867 pgoff_t tindex, 885 pgoff_t tindex,
868 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
869 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
870 struct writeback_control *wbc, 888 struct writeback_control *wbc,
871 int startio, 889 int startio,
@@ -884,7 +902,7 @@ xfs_cluster_write(
884 902
885 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
886 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
887 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
888 if (done) 906 if (done)
889 break; 907 break;
890 } 908 }
@@ -929,7 +947,7 @@ xfs_aops_discard_page(
929 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
931 949
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
933 goto out_invalidate; 951 goto out_invalidate;
934 952
935 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1041,15 +1059,15 @@ xfs_page_state_convert(
1041 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1042{ 1060{
1043 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1044 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1045 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1046 loff_t offset; 1064 loff_t offset;
1047 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1048 unsigned int type; 1066 unsigned int type;
1049 __uint64_t end_offset; 1067 __uint64_t end_offset;
1050 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1051 ssize_t size, len; 1069 ssize_t size, len;
1052 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1053 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1054 int trylock = 0; 1072 int trylock = 0;
1055 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1096,7 +1114,7 @@ xfs_page_state_convert(
1096 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1097 offset = page_offset(page); 1115 offset = page_offset(page);
1098 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1099 type = IOMAP_NEW; 1117 type = IO_NEW;
1100 1118
1101 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1102 1120
@@ -1110,12 +1128,12 @@ xfs_page_state_convert(
1110 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
1111 * isn't. This shouldn't happen too often. 1129 * isn't. This shouldn't happen too often.
1112 */ 1130 */
1113 iomap_valid = 0; 1131 imap_valid = 0;
1114 continue; 1132 continue;
1115 } 1133 }
1116 1134
1117 if (iomap_valid) 1135 if (imap_valid)
1118 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1119 1137
1120 /* 1138 /*
1121 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1136,20 +1154,20 @@ xfs_page_state_convert(
1136 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1137 */ 1155 */
1138 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1139 iomap_valid = 0; 1157 imap_valid = 0;
1140 1158
1141 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1142 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1143 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1144 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1145 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1146 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1147 } else { 1165 } else {
1148 type = IOMAP_NEW; 1166 type = IO_NEW;
1149 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1150 } 1168 }
1151 1169
1152 if (!iomap_valid) { 1170 if (!imap_valid) {
1153 /* 1171 /*
1154 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1155 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1159,7 +1177,7 @@ xfs_page_state_convert(
1159 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1160 */ 1178 */
1161 new_ioend = 1; 1179 new_ioend = 1;
1162 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1163 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1164 page, bh, head, 0); 1182 page, bh, head, 0);
1165 } else { 1183 } else {
@@ -1167,14 +1185,14 @@ xfs_page_state_convert(
1167 } 1185 }
1168 1186
1169 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1170 &iomap, flags); 1188 &imap, flags);
1171 if (err) 1189 if (err)
1172 goto error; 1190 goto error;
1173 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1174 } 1193 }
1175 if (iomap_valid) { 1194 if (imap_valid) {
1176 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1177 inode->i_blkbits, &iomap);
1178 if (startio) { 1196 if (startio) {
1179 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1180 type, &ioend, 1198 type, &ioend,
@@ -1193,40 +1211,41 @@ xfs_page_state_convert(
1193 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1194 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1195 */ 1213 */
1196 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1197 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1198 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1199 head, 1); 1217 head, 1);
1200 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1201 &iomap, flags); 1219 &imap, flags);
1202 if (err) 1220 if (err)
1203 goto error; 1221 goto error;
1204 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1205 } 1224 }
1206 1225
1207 /* 1226 /*
1208 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1209 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1210 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1211 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1212 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1213 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1214 */ 1233 */
1215 type = IOMAP_NEW; 1234 type = IO_NEW;
1216 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1217 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1218 if (iomap_valid) 1237 if (imap_valid)
1219 all_bh = 1; 1238 all_bh = 1;
1220 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1221 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1222 page_dirty--; 1241 page_dirty--;
1223 count++; 1242 count++;
1224 } else { 1243 } else {
1225 iomap_valid = 0; 1244 imap_valid = 0;
1226 } 1245 }
1227 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1228 (unmapped || startio)) { 1247 (unmapped || startio)) {
1229 iomap_valid = 0; 1248 imap_valid = 0;
1230 } 1249 }
1231 1250
1232 if (!iohead) 1251 if (!iohead)
@@ -1240,12 +1259,23 @@ xfs_page_state_convert(
1240 if (startio) 1259 if (startio)
1241 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1242 1261
1243 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1244 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1245 PAGE_CACHE_SHIFT; 1264
1246 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1247 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1248 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1249 } 1279 }
1250 1280
1251 if (iohead) 1281 if (iohead)
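The replacement for the old tlast computation converts the extent end from filesystem blocks to bytes to a page index, then clamps it against the file's last page before calling xfs_cluster_write(). The same unit conversion as standalone C (4k pages assumed):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4k pages assumed */

/* Last page index covered by an extent, clamped to the file's last page. */
static long long extent_end_page(long long br_startoff,
				 long long br_blockcount,
				 unsigned blkbits, long long last_index)
{
	long long end = br_startoff + br_blockcount;	/* fs blocks */

	end <<= blkbits;				/* -> bytes */
	end = (end - 1) >> PAGE_CACHE_SHIFT;		/* -> last page */

	return end > last_index ? last_index : end;
}

int main(void)
{
	/* an extent of 4 x 4k blocks starting at block 10 ends in page 13 */
	printf("%lld\n", extent_end_page(10, 4, 12, 100));
	return 0;
}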
@@ -1447,10 +1477,11 @@ __xfs_get_blocks(
1447 int direct, 1477 int direct,
1448 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1449{ 1479{
1450 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1451 xfs_off_t offset; 1481 xfs_off_t offset;
1452 ssize_t size; 1482 ssize_t size;
1453 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1454 int error; 1485 int error;
1455 1486
1456 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1461,22 +1492,21 @@ __xfs_get_blocks(
1461 return 0; 1492 return 0;
1462 1493
1463 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1464 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1465 if (error) 1496 if (error)
1466 return -error; 1497 return -error;
1467 if (niomap == 0) 1498 if (nimap == 0)
1468 return 0; 1499 return 0;
1469 1500
1470 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1471 /* 1503 /*
1472 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1473 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1474 */ 1506 */
1475 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1476 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1477 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1478 }
1479 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1480 if (direct) 1510 if (direct)
1481 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1482 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
@@ -1487,7 +1517,7 @@ __xfs_get_blocks(
1487 * If this is a realtime file, data may be on a different device. 1517 * If this is a realtime file, data may be on a different device.
1488 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1489 */ 1519 */
1490 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1491 1521
1492 /* 1522 /*
1493 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1501,10 +1531,10 @@ __xfs_get_blocks(
1501 if (create && 1531 if (create &&
1502 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1503 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1504 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1505 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1506 1536
1507 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1508 BUG_ON(direct); 1538 BUG_ON(direct);
1509 if (create) { 1539 if (create) {
1510 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1513,11 +1543,23 @@ __xfs_get_blocks(
1513 } 1543 }
1514 } 1544 }
1515 1545
1546 /*
1547 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1516 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1517 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1518 offset = min_t(xfs_off_t, 1552
1519 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1520 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1521 } 1563 }
1522 1564
1523 return 0; 1565 return 0;
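As the new comment says, direct I/O and the mpage code are told how much mapping remains past the requested block so they can avoid repeated get_blocks calls; the value is clamped to the request size and to LONG_MAX because b_size is a signed long. A sketch of the clamping (standalone C):

#include <stdio.h>
#include <limits.h>

/*
 * Bytes of mapping available from iblock onward, clamped to the
 * request size and to LONG_MAX -- the logic of the b_size hunk above.
 */
static long long clamp_mapping(long long br_startoff, long long br_blockcount,
			       long long iblock, unsigned blkbits,
			       long long req_size)
{
	long long mapping_size =
		(br_startoff + br_blockcount - iblock) << blkbits;

	if (mapping_size > req_size)
		mapping_size = req_size;
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;
	return mapping_size;
}

int main(void)
{
	/* 4 blocks mapped from block 10; 64k requested at block 11 */
	printf("%lld\n", clamp_mapping(10, 4, 11, 12, 65536));	/* 12288 */
	return 0;
}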
@@ -1575,7 +1617,7 @@ xfs_end_io_direct(
1575 */ 1617 */
1576 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1577 ioend->io_size = size; 1619 ioend->io_size = size;
1578 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1579 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1580 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1581 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1586,7 +1628,7 @@ xfs_end_io_direct(
1586 * didn't map an unwritten extent so switch its completion 1628 * didn't map an unwritten extent so switch its completion
1587 * handler. 1629 * handler.
1588 */ 1630 */
1589 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1590 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1591 } 1633 }
1592 1634
@@ -1611,10 +1653,10 @@ xfs_vm_direct_IO(
1611 struct block_device *bdev; 1653 struct block_device *bdev;
1612 ssize_t ret; 1654 ssize_t ret;
1613 1655
1614 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1615 1657
1616 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1617 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1618 1660
1619 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1620 offset, nr_segs, 1662 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bd111b7e1daa..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
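The new check in xfs_buf_lock() encodes the rule in the comment above: a pinned, stale buffer means the log transaction that frees it has not been written yet, so push the log before blocking rather than waiting for someone else to. A toy model of that control flow (standalone C; types and names are stand-ins, and the semaphore is reduced to a printf):

#include <stdio.h>

#define XBF_STALE 0x1

struct buf { int pin_count; unsigned flags; };

static void force_log(void) { printf("pushing log before blocking\n"); }
static void take_sema(struct buf *bp) { (void)bp; printf("got lock\n"); }

/* Pinned + stale: the log hasn't gone to disk, so push it ourselves. */
static void buf_lock(struct buf *bp)
{
	if (bp->pin_count && (bp->flags & XBF_STALE))
		force_log();
	take_sema(bp);
}

int main(void)
{
	struct buf bp = { .pin_count = 1, .flags = XBF_STALE };

	buf_lock(&bp);
	return 0;
}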
@@ -1007,25 +1016,20 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 1016 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 1017 struct xfs_buf *bp)
1009{ 1018{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1019 int error;
1011 int error = 0;
1012 1020
1013 bp->b_strat = xfs_bdstrat_cb; 1021 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 1022 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 1023 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 1024 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 1025
1019 xfs_buf_delwri_dequeue(bp); 1026 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 1027 xfs_buf_iostrategy(bp);
1021 1028
1022 if (iowait) { 1029 error = xfs_buf_iowait(bp);
1023 error = xfs_buf_iowait(bp); 1030 if (error)
1024 if (error) 1031 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1032 xfs_buf_relse(bp);
1026 xfs_buf_relse(bp);
1027 }
1028
1029 return error; 1033 return error;
1030} 1034}
1031 1035
@@ -1614,7 +1618,8 @@ xfs_mapping_buftarg(
1614 1618
1615STATIC int 1619STATIC int
1616xfs_alloc_delwrite_queue( 1620xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1621 xfs_buftarg_t *btp,
1622 const char *fsname)
1618{ 1623{
1619 int error = 0; 1624 int error = 0;
1620 1625
@@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1627 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1628 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1629 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1630 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1631 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1632 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1633 goto out_error;
@@ -1635,7 +1640,8 @@ out_error:
1635xfs_buftarg_t * 1640xfs_buftarg_t *
1636xfs_alloc_buftarg( 1641xfs_alloc_buftarg(
1637 struct block_device *bdev, 1642 struct block_device *bdev,
1638 int external) 1643 int external,
1644 const char *fsname)
1639{ 1645{
1640 xfs_buftarg_t *btp; 1646 xfs_buftarg_t *btp;
1641 1647
@@ -1647,7 +1653,7 @@ xfs_alloc_buftarg(
1647 goto error; 1653 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1654 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1655 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1656 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1657 goto error;
1652 xfs_alloc_bufhash(btp, external); 1658 xfs_alloc_bufhash(btp, external);
1653 return btp; 1659 return btp;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
@@ -138,8 +140,8 @@ xfs_file_fsync(
138 * might get cleared when the inode gets written out via the AIL 140 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
140 */ 142 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) { 145 ip->i_update_core) {
144 /* 146 /*
145 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -866,7 +868,7 @@ write_retry:
866 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
868 870
869 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
870 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error) 873 if (!error)
872 error = error2; 874 error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
58#include <linux/mount.h> 58#include <linux/mount.h>
59#include <linux/namei.h> 59#include <linux/namei.h>
60#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
61#include <linux/exportfs.h> 62#include <linux/exportfs.h>
62 63
63/* 64/*
@@ -526,6 +527,10 @@ xfs_attrmulti_by_handle(
526 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
527 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
528 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
529 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
530 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
531 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
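The added opcount test guards the multiplication in the later buffer allocation: without it, a huge opcount could make opcount * sizeof(xfs_attr_multiop_t) wrap and pair a short allocation with a long loop. A standalone demonstration of the guard (struct size illustrative):

#include <stdio.h>
#include <limits.h>

struct op { char pad[32]; };	/* stand-in for xfs_attr_multiop_t */

/* Reject counts whose total byte size would overflow the allocation. */
static int opcount_ok(unsigned int opcount)
{
	return opcount < INT_MAX / sizeof(struct op);
}

int main(void)
{
	unsigned int huge = INT_MAX / sizeof(struct op) + 1;

	printf("%d %d\n", opcount_ok(16), opcount_ok(huge));	/* 1 0 */
	return 0;
}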
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -419,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
419 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
420 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
421 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
422 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
423 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
424 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
56#include <linux/security.h> 56#include <linux/security.h>
57#include <linux/falloc.h> 57#include <linux/falloc.h>
58#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
59 60
60/* 61/*
61 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -672,7 +673,10 @@ xfs_vn_fiemap(
672 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
673 674
674 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
675 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
676 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
677 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
678 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
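The fiemap fix bounds bmv_count from both sides: a caller that passes fi_extents_max == 0 gets a large default (MAXEXTNUM), and the result is then capped so the getbmapx array stays within a 16-page allocation. A sketch of the double clamp (standalone C; the struct size and MAXEXTNUM value here are assumptions):

#include <stdio.h>

#define PAGE_SIZE 4096
#define MAXEXTNUM 32768			/* assumed large default */

struct getbmapx_stub { char pad[56]; };	/* sized stand-in */

static int clamp_count(int extents_max)
{
	int count = !extents_max ? MAXEXTNUM : extents_max + 1;
	int cap = (int)(PAGE_SIZE * 16 / sizeof(struct getbmapx_stub));

	return count < cap ? count : cap;
}

int main(void)
{
	printf("%d %d\n", clamp_count(0), clamp_count(100));	/* 1170 101 */
	return 0;
}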
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -97,7 +98,7 @@ xfs_fs_set_xstate(
97} 98}
98 99
99STATIC int 100STATIC int
100xfs_fs_get_xquota( 101xfs_fs_get_dqblk(
101 struct super_block *sb, 102 struct super_block *sb,
102 int type, 103 int type,
103 qid_t id, 104 qid_t id,
@@ -114,7 +115,7 @@ xfs_fs_get_xquota(
114} 115}
115 116
116STATIC int 117STATIC int
117xfs_fs_set_xquota( 118xfs_fs_set_dqblk(
118 struct super_block *sb, 119 struct super_block *sb,
119 int type, 120 int type,
120 qid_t id, 121 qid_t id,
@@ -135,6 +136,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 136const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 137 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 138 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 139 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 140 .set_dqblk = xfs_fs_set_dqblk,
140}; 141};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
64#include <linux/mount.h> 65#include <linux/mount.h>
65#include <linux/mempool.h> 66#include <linux/mempool.h>
66#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -118,6 +119,8 @@ mempool_t *xfs_ioend_pool;
118#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
119#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
121 124
122/* 125/*
123 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -373,6 +376,13 @@ xfs_parseargs(
373 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
376 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
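delaylog/nodelaylog join the existing strcmp chain: each comma-separated token either sets or clears a bit in m_flags, which is also how the paired entry in xfs_showargs() can print the option back. A standalone sketch of the pattern (the kernel walks tokens with strsep(); strtok() is used here so the example is plain ISO C, and the flag value is made up):

#include <stdio.h>
#include <string.h>

#define MOUNT_DELAYLOG 0x1	/* stand-in for XFS_MOUNT_DELAYLOG */

/* Parse a comma-separated option string, mirroring xfs_parseargs(). */
static unsigned int parse_opts(char *options)
{
	unsigned int flags = 0;
	char *this_char;

	for (this_char = strtok(options, ","); this_char;
	     this_char = strtok(NULL, ",")) {
		if (!strcmp(this_char, "delaylog"))
			flags |= MOUNT_DELAYLOG;
		else if (!strcmp(this_char, "nodelaylog"))
			flags &= ~MOUNT_DELAYLOG;
	}
	return flags;
}

int main(void)
{
	char opts[] = "nodelaylog,delaylog";

	printf("flags=%#x\n", parse_opts(opts));	/* flags=0x1 */
	return 0;
}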
@@ -534,6 +544,7 @@ xfs_showargs(
534 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
535 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
536 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
537 { 0, NULL } 548 { 0, NULL }
538 }; 549 };
539 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -724,7 +735,8 @@ void
724xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
725 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
726{ 737{
727 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
728} 740}
729 741
730STATIC void 742STATIC void
@@ -788,18 +800,18 @@ xfs_open_devices(
788 * Setup xfs_mount buffer target pointers 800 * Setup xfs_mount buffer target pointers
789 */ 801 */
790 error = ENOMEM; 802 error = ENOMEM;
791 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 803 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
792 if (!mp->m_ddev_targp) 804 if (!mp->m_ddev_targp)
793 goto out_close_rtdev; 805 goto out_close_rtdev;
794 806
795 if (rtdev) { 807 if (rtdev) {
796 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 808 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
797 if (!mp->m_rtdev_targp) 809 if (!mp->m_rtdev_targp)
798 goto out_free_ddev_targ; 810 goto out_free_ddev_targ;
799 } 811 }
800 812
801 if (logdev && logdev != ddev) { 813 if (logdev && logdev != ddev) {
802 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 814 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
803 if (!mp->m_logdev_targp) 815 if (!mp->m_logdev_targp)
804 goto out_free_rtdev_targ; 816 goto out_free_rtdev_targ;
805 } else { 817 } else {
@@ -901,7 +913,8 @@ xfsaild_start(
901 struct xfs_ail *ailp) 913 struct xfs_ail *ailp)
902{ 914{
903 ailp->xa_target = 0; 915 ailp->xa_target = 0;
904 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 916 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
917 ailp->xa_mount->m_fsname);
905 if (IS_ERR(ailp->xa_task)) 918 if (IS_ERR(ailp->xa_task))
906 return -PTR_ERR(ailp->xa_task); 919 return -PTR_ERR(ailp->xa_task);
907 return 0; 920 return 0;
@@ -1091,6 +1104,7 @@ xfs_fs_write_inode(
1091 * the code will only flush the inode if it isn't already 1104 * the code will only flush the inode if it isn't already
1092 * being flushed. 1105 * being flushed.
1093 */ 1106 */
1107 xfs_ioend_wait(ip);
1094 xfs_ilock(ip, XFS_ILOCK_SHARED); 1108 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) { 1109 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip); 1110 error = xfs_log_inode(ip);
@@ -1208,6 +1222,7 @@ xfs_fs_put_super(
1208 1222
1209 xfs_unmountfs(mp); 1223 xfs_unmountfs(mp);
1210 xfs_freesb(mp); 1224 xfs_freesb(mp);
1225 xfs_inode_shrinker_unregister(mp);
1211 xfs_icsb_destroy_counters(mp); 1226 xfs_icsb_destroy_counters(mp);
1212 xfs_close_devices(mp); 1227 xfs_close_devices(mp);
1213 xfs_dmops_put(mp); 1228 xfs_dmops_put(mp);
@@ -1621,6 +1636,8 @@ xfs_fs_fill_super(
1621 if (error) 1636 if (error)
1622 goto fail_vnrele; 1637 goto fail_vnrele;
1623 1638
1639 xfs_inode_shrinker_register(mp);
1640
1624 kfree(mtpt); 1641 kfree(mtpt);
1625 return 0; 1642 return 0;
1626 1643
@@ -1748,7 +1765,7 @@ xfs_init_zones(void)
1748 * but it is much faster. 1765 * but it is much faster.
1749 */ 1766 */
1750 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1751 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1752 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1753 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1754 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
@@ -1866,6 +1883,7 @@ init_xfs_fs(void)
1866 goto out_cleanup_procfs; 1883 goto out_cleanup_procfs;
1867 1884
1868 vfs_initquota(); 1885 vfs_initquota();
1886 xfs_inode_shrinker_init();
1869 1887
1870 error = register_filesystem(&xfs_fs_type); 1888 error = register_filesystem(&xfs_fs_type);
1871 if (error) 1889 if (error)
@@ -1893,6 +1911,7 @@ exit_xfs_fs(void)
1893{ 1911{
1894 vfs_exitquota(); 1912 vfs_exitquota();
1895 unregister_filesystem(&xfs_fs_type); 1913 unregister_filesystem(&xfs_fs_type);
1914 xfs_inode_shrinker_destroy();
1896 xfs_sysctl_unregister(); 1915 xfs_sysctl_unregister();
1897 xfs_cleanup_procfs(); 1916 xfs_cleanup_procfs();
1898 xfs_buf_terminate(); 1917 xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive,
99 int *nr_to_scan)
99{ 100{
100 uint32_t first_index; 101 uint32_t first_index;
101 int last_error = 0; 102 int last_error = 0;
@@ -134,7 +135,7 @@ restart:
134 if (error == EFSCORRUPTED) 135 if (error == EFSCORRUPTED)
135 break; 136 break;
136 137
137 } while (1); 138 } while ((*nr_to_scan)--);
138 139
139 if (skipped) { 140 if (skipped) {
140 delay(1); 141 delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
150 struct xfs_perag *pag, int flags), 151 struct xfs_perag *pag, int flags),
151 int flags, 152 int flags,
152 int tag, 153 int tag,
153 int exclusive) 154 int exclusive,
155 int *nr_to_scan)
154{ 156{
155 int error = 0; 157 int error = 0;
156 int last_error = 0; 158 int last_error = 0;
157 xfs_agnumber_t ag; 159 xfs_agnumber_t ag;
160 int nr;
158 161
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
160 struct xfs_perag *pag; 164 struct xfs_perag *pag;
161 165
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
165 continue; 169 continue;
166 } 170 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive); 172 exclusive, &nr);
169 xfs_perag_put(pag); 173 xfs_perag_put(pag);
170 if (error) { 174 if (error) {
171 last_error = error; 175 last_error = error;
172 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
173 break; 177 break;
174 } 178 }
179 if (nr <= 0)
180 break;
175 } 181 }
182 if (nr_to_scan)
183 *nr_to_scan = nr;
176 return XFS_ERROR(last_error); 184 return XFS_ERROR(last_error);
177} 185}
178 186
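Threading nr_to_scan through xfs_inode_ag_iterator() gives the new inode shrinker a scan budget: each allocation group consumes part of it and the walk stops once it is spent, while callers passing NULL get INT_MAX and keep the old unbounded behaviour. A compact model of the budgeted walk (standalone C; the per-inode work is a printf):

#include <stdio.h>
#include <limits.h>

/* Visit up to *nr items in one group, decrementing the shared budget. */
static void walk_group(int group, int items, int *nr)
{
	while (items-- > 0 && (*nr)-- > 0)
		printf("group %d: scanned one inode\n", group);
}

/* Walk all groups until the budget runs out, as the iterator now does. */
static void walk_all(int groups, int items_per_group, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;

	for (int g = 0; g < groups; g++) {
		walk_group(g, items_per_group, &nr);
		if (nr <= 0)
			break;
	}
	if (nr_to_scan)
		*nr_to_scan = nr;
}

int main(void)
{
	int budget = 5;

	walk_all(4, 3, &budget);	/* stops after 5 of 12 inodes */
	return 0;
}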
@@ -291,7 +299,7 @@ xfs_sync_data(
291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 299 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
292 300
293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 301 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
294 XFS_ICI_NO_TAG, 0); 302 XFS_ICI_NO_TAG, 0, NULL);
295 if (error) 303 if (error)
296 return XFS_ERROR(error); 304 return XFS_ERROR(error);
297 305
@@ -310,7 +318,7 @@ xfs_sync_attr(
310 ASSERT((flags & ~SYNC_WAIT) == 0); 318 ASSERT((flags & ~SYNC_WAIT) == 0);
311 319
312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 320 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
313 XFS_ICI_NO_TAG, 0); 321 XFS_ICI_NO_TAG, 0, NULL);
314} 322}
315 323
316STATIC int 324STATIC int
@@ -348,68 +356,23 @@ xfs_commit_dummy_trans(
348 356
349STATIC int 357STATIC int
350xfs_sync_fsdata( 358xfs_sync_fsdata(
351 struct xfs_mount *mp, 359 struct xfs_mount *mp)
352 int flags)
353{ 360{
354 struct xfs_buf *bp; 361 struct xfs_buf *bp;
355 struct xfs_buf_log_item *bip;
356 int error = 0;
357
358 /*
359 * If this is xfssyncd() then only sync the superblock if we can
360 * lock it without sleeping and it is not pinned.
361 */
362 if (flags & SYNC_TRYLOCK) {
363 ASSERT(!(flags & SYNC_WAIT));
364
365 bp = xfs_getsb(mp, XBF_TRYLOCK);
366 if (!bp)
367 goto out;
368
369 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
370 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
371 goto out_brelse;
372 } else {
373 bp = xfs_getsb(mp, 0);
374
375 /*
376 * If the buffer is pinned then push on the log so we won't
377 * get stuck waiting in the write for someone, maybe
378 * ourselves, to flush the log.
379 *
380 * Even though we just pushed the log above, we did not have
381 * the superblock buffer locked at that point so it can
382 * become pinned in between there and here.
383 */
384 if (XFS_BUF_ISPINNED(bp))
385 xfs_log_force(mp, 0);
386 }
387
388
389 if (flags & SYNC_WAIT)
390 XFS_BUF_UNASYNC(bp);
391 else
392 XFS_BUF_ASYNC(bp);
393
394 error = xfs_bwrite(mp, bp);
395 if (error)
396 return error;
397 362
398 /* 363 /*
399 * If this is a data integrity sync make sure all pending buffers 364 * If the buffer is pinned then push on the log so we won't get stuck
400 * are flushed out for the log coverage check below. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
401 */ 370 */
402 if (flags & SYNC_WAIT) 371 bp = xfs_getsb(mp, 0);
403 xfs_flush_buftarg(mp->m_ddev_targp, 1); 372 if (XFS_BUF_ISPINNED(bp))
404 373 xfs_log_force(mp, 0);
405 if (xfs_log_need_covered(mp))
406 error = xfs_commit_dummy_trans(mp, flags);
407 return error;
408 374
409 out_brelse: 375 return xfs_bwrite(mp, bp);
410 xfs_buf_relse(bp);
411 out:
412 return error;
413} 376}
414 377
415/* 378/*
@@ -433,7 +396,7 @@ int
433xfs_quiesce_data( 396xfs_quiesce_data(
434 struct xfs_mount *mp) 397 struct xfs_mount *mp)
435{ 398{
436 int error; 399 int error, error2 = 0;
437 400
438 /* push non-blocking */ 401 /* push non-blocking */
439 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -444,13 +407,20 @@ xfs_quiesce_data(
444 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
445 408
446 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
448 418
449 /* flush data-only devices */ 419 /* flush data-only devices */
450 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
451 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
452 422
453 return error; 423 return error ? error : error2;
454} 424}
455 425
456STATIC void 426STATIC void
@@ -573,9 +543,9 @@ xfs_flush_inodes(
573} 543}
574 544
575/* 545/*
576 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
577 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
578 * to indicate it is idle. 548 * filesystem is idle.
579 */ 549 */
580STATIC void 550STATIC void
581xfs_sync_worker( 551xfs_sync_worker(
@@ -589,7 +559,8 @@ xfs_sync_worker(
589 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
590 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
591 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
593 } 564 }
594 mp->m_sync_seq++; 565 mp->m_sync_seq++;
595 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -652,7 +623,7 @@ xfs_syncd_init(
652 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
653 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
654 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
655 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
656 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
657 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
658 return 0; 629 return 0;
@@ -673,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
673 radix_tree_tag_set(&pag->pag_ici_root, 644 radix_tree_tag_set(&pag->pag_ici_root,
674 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 645 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
675 XFS_ICI_RECLAIM_TAG); 646 XFS_ICI_RECLAIM_TAG);
647 pag->pag_ici_reclaimable++;
676} 648}
677 649
678/* 650/*
@@ -705,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
705{ 677{
706 radix_tree_tag_clear(&pag->pag_ici_root, 678 radix_tree_tag_clear(&pag->pag_ici_root,
707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 679 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
680 pag->pag_ici_reclaimable--;
708} 681}
709 682
710/* 683/*
@@ -820,10 +793,10 @@ xfs_reclaim_inode(
820 * call into reclaim to find it in a clean state instead of waiting for 793 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient 794 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error 795 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will relcaim the inode and 796 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error. 797 * pass on the error.
825 */ 798 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 799 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 800 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d", 801 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error); 802 (long long)ip->i_ino, error);
@@ -854,5 +827,93 @@ xfs_reclaim_inodes(
854 int mode) 827 int mode)
855{ 828{
856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 829 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
857 XFS_ICI_RECLAIM_TAG, 1); 830 XFS_ICI_RECLAIM_TAG, 1, NULL);
831}
832
833/*
834 * Shrinker infrastructure.
835 *
836 * This is all far more complex than it needs to be. It adds a global list of
837 * mounts because the shrinkers can only call a global context. We need to make
838 * the shrinkers pass a context to avoid the need for global state.
839 */
840static LIST_HEAD(xfs_mount_list);
841static struct rw_semaphore xfs_mount_list_lock;
842
843static int
844xfs_reclaim_inode_shrink(
845 int nr_to_scan,
846 gfp_t gfp_mask)
847{
848 struct xfs_mount *mp;
849 struct xfs_perag *pag;
850 xfs_agnumber_t ag;
851 int reclaimable = 0;
852
853 if (nr_to_scan) {
854 if (!(gfp_mask & __GFP_FS))
855 return -1;
856
857 down_read(&xfs_mount_list_lock);
858 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
859 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
860 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
861 if (nr_to_scan <= 0)
862 break;
863 }
864 up_read(&xfs_mount_list_lock);
865 }
866
867 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag);
878 }
879 }
880 up_read(&xfs_mount_list_lock);
881 return reclaimable;
882}
883
884static struct shrinker xfs_inode_shrinker = {
885 .shrink = xfs_reclaim_inode_shrink,
886 .seeks = DEFAULT_SEEKS,
887};
888
889void __init
890xfs_inode_shrinker_init(void)
891{
892 init_rwsem(&xfs_mount_list_lock);
893 register_shrinker(&xfs_inode_shrinker);
894}
895
896void
897xfs_inode_shrinker_destroy(void)
898{
899 ASSERT(list_empty(&xfs_mount_list));
900 unregister_shrinker(&xfs_inode_shrinker);
901}
902
903void
904xfs_inode_shrinker_register(
905 struct xfs_mount *mp)
906{
907 down_write(&xfs_mount_list_lock);
908 list_add_tail(&mp->m_mplist, &xfs_mount_list);
909 up_write(&xfs_mount_list_lock);
910}
911
912void
913xfs_inode_shrinker_unregister(
914 struct xfs_mount *mp)
915{
916 down_write(&xfs_mount_list_lock);
917 list_del(&mp->m_mplist);
918 up_write(&xfs_mount_list_lock);
858} 919}
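The xfs_sync.c changes above thread an optional scan budget through the per-AG
inode walk: callers pass NULL for an unbounded walk, or a pointer whose value
is decremented as items are visited and written back with the unused remainder
so the shrinker can stop early. A minimal standalone sketch of that
budget-threading pattern, with illustrative names only (this is not the kernel
API), and with the diff's post-decrement quirk simplified away:

#include <limits.h>
#include <stdio.h>

/* Walk one group, consuming the shared budget as items are visited. */
static void walk_group(int group, int *budget)
{
	int item;

	(void)group;
	for (item = 0; item < 4 && *budget > 0; item++)
		(*budget)--;		/* "process" one cached object */
}

/* Iterate all groups; a NULL budget means "scan everything". */
static void walk_all(int ngroups, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;
	int g;

	for (g = 0; g < ngroups; g++) {
		walk_group(g, &nr);
		if (nr <= 0)
			break;		/* budget exhausted: stop early */
	}
	if (nr_to_scan)
		*nr_to_scan = nr;	/* report the unused remainder */
}

int main(void)
{
	int budget = 10;

	walk_all(8, &budget);		/* bounded, as the shrinker calls it */
	walk_all(8, NULL);		/* unbounded, as sync/reclaim call it */
	printf("budget left: %d\n", budget);
	return 0;
}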
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c346cabb..cdcbaaca9880 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
54int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock); 56 int flags, int tag, int write_lock, int *nr_to_scan);
57
58void xfs_inode_shrinker_init(void);
59void xfs_inode_shrinker_destroy(void);
60void xfs_inode_shrinker_register(struct xfs_mount *mp);
61void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
57 62
58#endif 63#endif
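The new xfs_inode_shrinker_* hooks plug into the shrinker interface of this
kernel generation: a callback taking (nr_to_scan, gfp_mask) that frees up to
nr_to_scan cached objects and returns how many remain reclaimable, or -1 when
the allocation context cannot recurse into filesystem code. A compressed
sketch of that contract; the callback body is illustrative, only the structure
mirrors the code above:

#include <linux/module.h>
#include <linux/mm.h>

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* Asked to reclaim: refuse if the caller must not
		 * re-enter filesystem code. */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		/* ... free up to nr_to_scan cached objects here ... */
	}
	/* Always finish by reporting how many objects remain. */
	return 0;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* Paired at init/teardown, as xfs_inode_shrinker_{init,destroy} are. */
static int __init demo_init(void)
{
	register_shrinker(&demo_shrinker);
	return 0;
}
static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}
module_init(demo_init);
module_exit(demo_exit);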
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1051 1059
1052); 1060);
1053 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1054TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1056 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1057 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1058 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1059 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1060 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1063 __field(int, slot) 1077 __field(int, sync)
1064 ), 1078 ),
1065 TP_fast_assign( 1079 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1067 __entry->agno = agno; 1083 __entry->agno = agno;
1068 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1069 __entry->len = len; 1085 __entry->len = len;
1070 __entry->slot = slot; 1086 __entry->sync = sync;
1071 ), 1087 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1073 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1074 __entry->agno, 1092 __entry->agno,
1075 __entry->agbno, 1093 __entry->agbno,
1076 __entry->len, 1094 __entry->len,
1077 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1078 1096
1079); 1097);
1080 1098
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
1085TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1088 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1089 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1090 __field(dev_t, dev) 1104 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1092 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1093 __field(int, found) 1107 __field(xfs_extlen_t, len)
1094 ), 1108 ),
1095 TP_fast_assign( 1109 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno; 1111 __entry->agno = agno;
1098 __entry->slot = slot; 1112 __entry->agbno = agbno;
1099 __entry->found = found; 1113 __entry->len = len;
1100 ), 1114 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1102 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno, 1117 __entry->agno,
1104 __entry->slot, 1118 __entry->agbno,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1106); 1120);
1107 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1108TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1110 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1111 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1112 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1113 __field(dev_t, dev) 1131 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1118 ), 1136 ),
1119 TP_fast_assign( 1137 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno; 1139 __entry->agno = agno;
1122 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1123 __entry->len = len; 1141 __entry->len = len;
1124 __entry->lsn = lsn; 1142 __entry->found = found;
1125 ), 1143 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1127 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno, 1146 __entry->agno,
1129 __entry->agbno, 1147 __entry->agbno,
1130 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1131 __entry->lsn) 1168 __entry->lsn)
1132); 1169);
1133 1170
@@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1532DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1533DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1534
1535DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1536 TP_PROTO(struct log *log, struct xlog_recover *trans,
1537 struct xlog_recover_item *item, int pass),
1538 TP_ARGS(log, trans, item, pass),
1539 TP_STRUCT__entry(
1540 __field(dev_t, dev)
1541 __field(unsigned long, item)
1542 __field(xlog_tid_t, tid)
1543 __field(int, type)
1544 __field(int, pass)
1545 __field(int, count)
1546 __field(int, total)
1547 ),
1548 TP_fast_assign(
1549 __entry->dev = log->l_mp->m_super->s_dev;
1550 __entry->item = (unsigned long)item;
1551 __entry->tid = trans->r_log_tid;
1552 __entry->type = ITEM_TYPE(item);
1553 __entry->pass = pass;
1554 __entry->count = item->ri_cnt;
1555 __entry->total = item->ri_total;
1556 ),
1557 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1558 "item region count/total %d/%d",
1559 MAJOR(__entry->dev), MINOR(__entry->dev),
1560 __entry->tid,
1561 __entry->pass,
1562 (void *)__entry->item,
1563 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1564 __entry->count,
1565 __entry->total)
1566)
1567
1568#define DEFINE_LOG_RECOVER_ITEM(name) \
1569DEFINE_EVENT(xfs_log_recover_item_class, name, \
1570 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1571 struct xlog_recover_item *item, int pass), \
1572 TP_ARGS(log, trans, item, pass))
1573
1574DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1575DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1576DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1577DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1578DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1579
1580DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1581 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1582 TP_ARGS(log, buf_f),
1583 TP_STRUCT__entry(
1584 __field(dev_t, dev)
1585 __field(__int64_t, blkno)
1586 __field(unsigned short, len)
1587 __field(unsigned short, flags)
1588 __field(unsigned short, size)
1589 __field(unsigned int, map_size)
1590 ),
1591 TP_fast_assign(
1592 __entry->dev = log->l_mp->m_super->s_dev;
1593 __entry->blkno = buf_f->blf_blkno;
1594 __entry->len = buf_f->blf_len;
1595 __entry->flags = buf_f->blf_flags;
1596 __entry->size = buf_f->blf_size;
1597 __entry->map_size = buf_f->blf_map_size;
1598 ),
1599 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1600 "map_size %d",
1601 MAJOR(__entry->dev), MINOR(__entry->dev),
1602 __entry->blkno,
1603 __entry->len,
1604 __entry->flags,
1605 __entry->size,
1606 __entry->map_size)
1607)
1608
1609#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1610DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1611 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1612 TP_ARGS(log, buf_f))
1613
1614DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1615DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1616DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1617DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1618DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1619DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1620DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1621DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1622
1623DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1624 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1625 TP_ARGS(log, in_f),
1626 TP_STRUCT__entry(
1627 __field(dev_t, dev)
1628 __field(xfs_ino_t, ino)
1629 __field(unsigned short, size)
1630 __field(int, fields)
1631 __field(unsigned short, asize)
1632 __field(unsigned short, dsize)
1633 __field(__int64_t, blkno)
1634 __field(int, len)
1635 __field(int, boffset)
1636 ),
1637 TP_fast_assign(
1638 __entry->dev = log->l_mp->m_super->s_dev;
1639 __entry->ino = in_f->ilf_ino;
1640 __entry->size = in_f->ilf_size;
1641 __entry->fields = in_f->ilf_fields;
1642 __entry->asize = in_f->ilf_asize;
1643 __entry->dsize = in_f->ilf_dsize;
1644 __entry->blkno = in_f->ilf_blkno;
1645 __entry->len = in_f->ilf_len;
1646 __entry->boffset = in_f->ilf_boffset;
1647 ),
1648 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1649 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1650 MAJOR(__entry->dev), MINOR(__entry->dev),
1651 __entry->ino,
1652 __entry->size,
1653 __entry->fields,
1654 __entry->asize,
1655 __entry->dsize,
1656 __entry->blkno,
1657 __entry->len,
1658 __entry->boffset)
1659)
1660#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1661DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1662 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1663 TP_ARGS(log, in_f))
1664
1665DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1666DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1667DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1668
1498#endif /* _TRACE_XFS_H */ 1669#endif /* _TRACE_XFS_H */
1499 1670
1500#undef TRACE_INCLUDE_PATH 1671#undef TRACE_INCLUDE_PATH
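The new trace entries above all use the ftrace event-class macros:
DECLARE_EVENT_CLASS fixes the record layout, the assignment logic and the
format string once, and each DEFINE_EVENT stamps out a named tracepoint that
reuses them. A minimal sketch of the pattern with made-up event names,
omitting the TRACE_SYSTEM/include boilerplate a real trace header needs; the
macros themselves are the standard ones used in the diff:

DECLARE_EVENT_CLASS(demo_lookup_class,
	TP_PROTO(int id, int state),
	TP_ARGS(id, state),
	TP_STRUCT__entry(
		__field(int, id)
		__field(int, state)
	),
	TP_fast_assign(
		__entry->id = id;
		__entry->state = state;
	),
	TP_printk("id %d %s", __entry->id,
		  __print_symbolic(__entry->state, { 0, "missing" },
						   { 1, "found" }))
)

#define DEFINE_DEMO_LOOKUP_EVENT(name) \
DEFINE_EVENT(demo_lookup_class, name, \
	TP_PROTO(int id, int state), \
	TP_ARGS(id, state))

DEFINE_DEMO_LOOKUP_EVENT(demo_lookup_hit);
DEFINE_DEMO_LOOKUP_EVENT(demo_lookup_miss);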
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
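The xfs_xattr.c change is a pure constification: the handler objects and the
table of pointers to them both become const, letting them live in read-only
data. A generic sketch of the same shape with hypothetical names (not the real
xattr_handler API); the NULL-terminated table mirrors how such handler arrays
are conventionally ended:

#include <stddef.h>

struct demo_handler {
	const char *prefix;
	int (*get)(const char *name, void *buf, int size);
};

static int demo_get(const char *name, void *buf, int size)
{
	return 0;	/* illustrative stub */
}

static const struct demo_handler demo_user_handler = {
	.prefix	= "user.",
	.get	= demo_get,
};

/* Both the array and the objects it points at are const. */
static const struct demo_handler *demo_handlers[] = {
	&demo_user_handler,
	NULL,		/* sentinel */
};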
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
689 tp = NULL; 689 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 690 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 692 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 693 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 694 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 695 128,
696 128, 696 0,
697 0, 697 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 698 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 699 if (error) {
700 cancelflags = 0; 700 cancelflags = 0;
701 goto error0; 701 goto error0;
702 } 702 }
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
751{ 751{
752 xfs_dquot_t *dqp; 752 xfs_dquot_t *dqp;
753 uint flist_locked; 753 uint flist_locked;
754 xfs_dquot_t *d;
755 754
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 755 ASSERT(mutex_is_locked(&qh->qh_lock));
757 756
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
760 /* 759 /*
761 * Traverse the hashchain looking for a match 760 * Traverse the hashchain looking for a match
762 */ 761 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 762 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 763 /*
765 * We already have the hashlock. We don't need the 764 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 765 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
772 /* 771 /*
773 * All in core dquots must be on the dqlist of mp 772 * All in core dquots must be on the dqlist of mp
774 */ 773 */
775 ASSERT(dqp->MPL_PREVP != NULL); 774 ASSERT(!list_empty(&dqp->q_mplist));
776 775
777 xfs_dqlock(dqp); 776 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 777 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 778 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 779 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 780 trace_xfs_dqlookup_want(dqp);
782 781
783 /* 782 /*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
787 */ 786 */
788 dqp->dq_flags |= XFS_DQ_WANT; 787 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 788 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 789 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 790 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 791 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 792 }
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
802 801
803 if (flist_locked) { 802 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 803 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 804 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 805 flist_locked = B_FALSE;
807 } else { 806 } else {
808 /* 807 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 808 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 809 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 810 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 811 }
817 } 812 }
818 813
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 814 XFS_DQHOLD(dqp);
823 815
824 if (flist_locked) 816 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 817 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 818 /*
827 * move the dquot to the front of the hashchain 819 * move the dquot to the front of the hashchain
828 */ 820 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 821 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 822 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 823 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 824 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 825 return 0;
844 return (0);
845 } 826 }
846 } 827 }
847 828
@@ -975,16 +956,17 @@ xfs_qm_dqget(
975 */ 956 */
976 if (ip) { 957 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 958 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 959
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 960 /*
984 * A dquot could be attached to this inode by now, since 961 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 962 * we had dropped the ilock.
986 */ 963 */
987 if (type == XFS_DQ_USER) { 964 if (type == XFS_DQ_USER) {
965 if (!XFS_IS_UQUOTA_ON(mp)) {
966 /* inode stays locked on return */
967 xfs_qm_dqdestroy(dqp);
968 return XFS_ERROR(ESRCH);
969 }
988 if (ip->i_udquot) { 970 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 971 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 972 dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
992 goto dqret; 974 goto dqret;
993 } 975 }
994 } else { 976 } else {
977 if (!XFS_IS_OQUOTA_ON(mp)) {
978 /* inode stays locked on return */
979 xfs_qm_dqdestroy(dqp);
980 return XFS_ERROR(ESRCH);
981 }
995 if (ip->i_gdquot) { 982 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 983 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 984 dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
1033 */ 1020 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1021 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1022 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1023 list_add(&dqp->q_hashlist, &h->qh_list);
1024 h->qh_version++;
1037 1025
1038 /* 1026 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1027 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1028 * kept inside the mount structure in m_quotainfo field
1041 */ 1029 */
1042 xfs_qm_mplist_lock(mp); 1030 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1031
1044 /* 1032 /*
1045 * We return a locked dquot to the caller, with a reference taken 1033 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1035 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1036 dqp->q_nrefs = 1;
1049 1037
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1038 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1039 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1040 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1041 mutex_unlock(&h->qh_lock);
1054 dqret: 1042 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1043 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1074 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1075 * in the right order; but try to get it out-of-order first
1088 */ 1076 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1077 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1078 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1079 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1080 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1081 xfs_dqlock(dqp);
1094 } 1082 }
1095 1083
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
1133 break; 1115 break;
1134 dqp = gdqp; 1116 dqp = gdqp;
1135 } 1117 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1118 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1119}
1138 1120
1139/* 1121/*
@@ -1386,10 +1368,10 @@ int
1386xfs_qm_dqpurge( 1368xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1369 xfs_dquot_t *dqp)
1388{ 1370{
1389 xfs_dqhash_t *thishash; 1371 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1372 xfs_mount_t *mp = dqp->q_mount;
1391 1373
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1374 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1375 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1376
1395 xfs_dqlock(dqp); 1377 xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
1407 return (1); 1389 return (1);
1408 } 1390 }
1409 1391
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1392 ASSERT(!list_empty(&dqp->q_freelist));
1411 1393
1412 /* 1394 /*
1413 * If we're turning off quotas, we have to make sure that, for 1395 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1434 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1435 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1436
1455 thishash = dqp->q_hash; 1437 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1438 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1439 list_del_init(&dqp->q_mplist);
1440 mp->m_quotainfo->qi_dqreclaims++;
1441 mp->m_quotainfo->qi_dquots--;
1458 /* 1442 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1443 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1444 * freelist lock.
1461 */ 1445 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1446 ASSERT(!list_empty(&dqp->q_freelist));
1463 1447
1464 dqp->q_mount = NULL; 1448 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1449 dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1451 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1452 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1453 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1454 mutex_unlock(&qh->qh_lock);
1471 return (0); 1455 return (0);
1472} 1456}
1473 1457
@@ -1517,6 +1501,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1501xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1502 xfs_dquot_t *dqp)
1519{ 1503{
1504 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1505 xfs_buf_t *bp;
1521 1506
1522 /* 1507 /*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1510 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1511 * the flush lock when the I/O completes.
1527 */ 1512 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1513 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1514 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1515 if (!bp)
1531 goto out_lock; 1516 goto out_lock;
1532 1517
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1518 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1519 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1520 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1521 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1522 wake_up_process(bp->b_target->bt_task);
1538 } 1523 }
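A recurring shape in the dqlookup/dqput hunks above is out-of-order lock
acquisition: the global freelist lock ranks above the per-dquot lock, so code
already holding a dquot lock first tries the freelist lock opportunistically
and only drops and reacquires in hierarchy order when that fails. A sketch of
just this pattern, with generic lock names:

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);	/* outer lock, e.g. the freelist lock */
static DEFINE_MUTEX(lock_b);	/* inner lock, e.g. the per-object lock */

/* Called with lock_b held; returns with both locks held. */
static void take_both_holding_b(void)
{
	if (!mutex_trylock(&lock_a)) {
		/* Contended: honour the hierarchy by releasing the inner
		 * lock, then take both in the right order. */
		mutex_unlock(&lock_b);
		mutex_lock(&lock_a);
		mutex_lock(&lock_b);
		/* Any state guarded by lock_b must be revalidated here,
		 * since it was briefly dropped -- the XFS_DQ_WANT dance
		 * in the diff exists for exactly this reason. */
	}
}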
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52	struct list_head q_hashlist;	/* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
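The header change replaces the hand-rolled next/prev pointer pairs (and the
XFS_DQ_IS_ON_FREELIST self-pointer test) with embedded struct list_head nodes,
so membership checks become list_empty() after a list_del_init(). A
self-contained userspace sketch of the embedded-node idiom, reimplementing
just enough of the kernel list primitives to run:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);	/* re-init so list_empty() holds again */
}

/* One embedded node per list the object can be on, as in xfs_dquot. */
struct demo_dquot {
	int id;
	struct list_head freelist;
	struct list_head hashlist;
};

int main(void)
{
	struct list_head freelist;
	struct demo_dquot dq = { .id = 7 };

	INIT_LIST_HEAD(&freelist);
	INIT_LIST_HEAD(&dq.freelist);
	INIT_LIST_HEAD(&dq.hashlist);

	list_add(&dq.freelist, &freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));
	list_del_init(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));
	return 0;
}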
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
107/* ARGSUSED */ 107/* ARGSUSED */
108STATIC void 108STATIC void
109xfs_qm_dquot_logitem_unpin( 109xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 110 xfs_dq_logitem_t *logitem)
111 int stale)
112{ 111{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 112 xfs_dquot_t *dqp = logitem->qli_dquot;
114 113
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem, 122 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp) 123 xfs_trans_t *tp)
125{ 124{
126 xfs_qm_dquot_logitem_unpin(logitem, 0); 125 xfs_qm_dquot_logitem_unpin(logitem);
127} 126}
128 127
129/* 128/*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
228 } 227 }
229 mp = dqp->q_mount; 228 mp = dqp->q_mount;
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 229 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 230 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 231 xfs_dqunlock(dqp);
233 if (!bp) 232 if (!bp)
234 return; 233 return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 328 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
330 xfs_qm_dquot_logitem_format, 329 xfs_qm_dquot_logitem_format,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 330 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 331 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
333 xfs_qm_dquot_logitem_unpin,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 332 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
335 xfs_qm_dquot_logitem_unpin_remove, 333 xfs_qm_dquot_logitem_unpin_remove,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 334 .iop_trylock = (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
357 xfs_dq_logitem_t *lp; 355 xfs_dq_logitem_t *lp;
358 lp = &dqp->q_logitem; 356 lp = &dqp->q_logitem;
359 357
360 lp->qli_item.li_type = XFS_LI_DQUOT; 358 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 359 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 360 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 361 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 362 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
426 */ 423 */
427/*ARGSUSED*/ 424/*ARGSUSED*/
428STATIC void 425STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 426xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
430{ 427{
431 return; 428 return;
432} 429}
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 534 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
538 xfs_qm_qoff_logitem_format, 535 xfs_qm_qoff_logitem_format,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 536 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 537 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
541 xfs_qm_qoff_logitem_unpin,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 538 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
543 xfs_qm_qoff_logitem_unpin_remove, 539 xfs_qm_qoff_logitem_unpin_remove,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 540 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 555 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
560 xfs_qm_qoff_logitem_format, 556 xfs_qm_qoff_logitem_format,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 557 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 558 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
563 xfs_qm_qoff_logitem_unpin,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 559 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
565 xfs_qm_qoff_logitem_unpin_remove, 560 xfs_qm_qoff_logitem_unpin_remove,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 561 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
586 581
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 582 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
588 583
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 584 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 585 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 586 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 587 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 588 qf->qql_format.qf_flags = flags;
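Both log-item constructors above collapse three open-coded field assignments
into a single xfs_log_item_init(mount, item, type, ops) call. A plausible
shape for such a helper, inferred from the call sites in this diff rather than
from its actual definition elsewhere in the series:

struct demo_mount;
struct demo_item_ops;

struct demo_log_item {
	struct demo_mount		*li_mountp;
	int				 li_type;
	const struct demo_item_ops	*li_ops;
};

static void demo_log_item_init(struct demo_mount *mp,
			       struct demo_log_item *item,
			       int type,
			       const struct demo_item_ops *ops)
{
	item->li_mountp = mp;
	item->li_type = type;
	item->li_ops = ops;
	/* The real helper may also set up list linkage and other common
	 * state; only the call signature is taken from the diff. */
}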
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 70STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 72STATIC int xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
84#endif 81#endif
85 82
86#ifdef QUOTADEBUG 83#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 84static void
88{ \ 85xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 86 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 87{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 88 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 89 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 90
94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 91	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 93 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 94 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 95 DQFLAGTO_TYPESTR(dqp),
96 (long long)be64_to_cpu(dqp->q_core.d_bcount),
97 (long long)be64_to_cpu(dqp->q_core.d_icount),
98 dqp->q_nrefs);
99 }
99} 100}
100#else 101#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 102static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 103#endif
103 104
104/* 105/*
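[Editor's note: under QUOTADEBUG the XQM_LIST_PRINT() macro becomes a real static function that walks the per-mount dquot list with list_for_each_entry(). The following self-contained sketch shows that iteration pattern over a list_head embedded in each entry, using cut-down stand-ins for the <linux/list.h> helpers; GNU C __typeof__ is assumed.]

#include <stdio.h>
#include <stddef.h>

/* Trimmed-down circular list in the style of <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Same shape as the kernel macro: walk the entries embedding 'member'. */
#define list_for_each_entry(pos, head, member)                            \
        for (pos = container_of((head)->next, __typeof__(*pos), member); \
             &pos->member != (head);                                      \
             pos = container_of(pos->member.next, __typeof__(*pos), member))

struct dquot { int id; struct list_head q_mplist; };

int main(void)
{
        struct list_head qi_dqlist;
        struct dquot a = { .id = 1 }, b = { .id = 2 }, *dqp;
        int i = 0;

        INIT_LIST_HEAD(&qi_dqlist);
        list_add_tail(&a.q_mplist, &qi_dqlist);
        list_add_tail(&b.q_mplist, &qi_dqlist);

        list_for_each_entry(dqp, &qi_dqlist, q_mplist)
                printf("%d. id=%d\n", ++i, dqp->id);
        return 0;
}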
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
144 /* 145 /*
145 * Freelist of all dquots of all file systems 146 * Freelist of all dquots of all file systems
146 */ 147 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 148 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
149 xqm->qm_dqfrlist_cnt = 0;
150 mutex_init(&xqm->qm_dqfrlist_lock);
148 151
149 /* 152 /*
150 * dquot zone. we register our own low-memory callback. 153 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
189xfs_qm_destroy( 192xfs_qm_destroy(
190 struct xfs_qm *xqm) 193 struct xfs_qm *xqm)
191{ 194{
195 struct xfs_dquot *dqp, *n;
192 int hsize, i; 196 int hsize, i;
193 197
194 ASSERT(xqm != NULL); 198 ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 208 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 209 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 210 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 211
212 /* frlist cleanup */
213 mutex_lock(&xqm->qm_dqfrlist_lock);
214 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
215 xfs_dqlock(dqp);
216#ifdef QUOTADEBUG
217 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
218#endif
219 list_del_init(&dqp->q_freelist);
220 xfs_Gqm->qm_dqfrlist_cnt--;
221 xfs_dqunlock(dqp);
222 xfs_qm_dqdestroy(dqp);
223 }
224 mutex_unlock(&xqm->qm_dqfrlist_lock);
225 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 226#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 227 mutex_destroy(&qcheck_lock);
210#endif 228#endif
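[Editor's note: freelist teardown above uses list_for_each_entry_safe(), whose extra cursor caches the successor so the current dquot can be unlinked and destroyed mid-walk, replacing the old manual "nextdqp = dqp->dq_flnext" bookkeeping. This sketch open-codes the same two-cursor walk the macro expands to; the minimal list helpers are included so it compiles standalone, and the names are illustrative.]

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dquot { struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        int cnt = 0;

        INIT_LIST_HEAD(&frlist);
        for (int i = 0; i < 3; i++) {
                struct dquot *dqp = malloc(sizeof(*dqp));
                list_add(&dqp->q_freelist, &frlist);
                cnt++;
        }

        /* The "safe" walk caches the next node up front, so the current
         * entry can be unlinked and freed without losing the cursor. */
        for (struct list_head *p = frlist.next, *n = p->next;
             p != &frlist; p = n, n = p->next) {
                struct dquot *dqp = container_of(p, struct dquot, q_freelist);

                list_del_init(&dqp->q_freelist);
                cnt--;
                free(dqp);      /* stands in for xfs_qm_dqdestroy() */
        }
        printf("remaining: %d\n", cnt);
        return 0;
}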
@@ -256,7 +274,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 274xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 275 struct xfs_mount *mp)
258{ 276{
259 xfs_dquot_t *dqp, *nextdqp; 277 xfs_dquot_t *dqp, *n;
260 278
261 ASSERT(xfs_Gqm); 279 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 280 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 282 /*
265 * Go thru the freelist and destroy all inactive dquots. 283 * Go thru the freelist and destroy all inactive dquots.
266 */ 284 */
267 xfs_qm_freelist_lock(xfs_Gqm); 285 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 286
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 287 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 288 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 289 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 290 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 291 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 292 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 293 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 294 list_del_init(&dqp->q_freelist);
295 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 296 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 297 xfs_qm_dqdestroy(dqp);
281 } else { 298 } else {
282 xfs_dqunlock(dqp); 299 xfs_dqunlock(dqp);
283 } 300 }
284 dqp = nextdqp;
285 } 301 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 302 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 303
288 /* 304 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 305 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 321 struct xfs_mount *mp)
306{ 322{
307 if (mp->m_quotainfo) { 323 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 324 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 325 xfs_qm_destroy_quotainfo(mp);
310 } 326 }
311} 327}
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
449 */ 465 */
450STATIC int 466STATIC int
451xfs_qm_dqflush_all( 467xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 468 struct xfs_mount *mp,
453 int sync_mode) 469 int sync_mode)
454{ 470{
455 int recl; 471 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 472 int recl;
457 int niters; 473 struct xfs_dquot *dqp;
458 int error; 474 int niters;
475 int error;
459 476
460 if (mp->m_quotainfo == NULL) 477 if (!q)
461 return 0; 478 return 0;
462 niters = 0; 479 niters = 0;
463again: 480again:
464 xfs_qm_mplist_lock(mp); 481 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 482 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 483 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 484 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 485 xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
470 } 487 }
471 488
472 /* XXX a sentinel would be better */ 489 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 490 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 491 if (!xfs_dqflock_nowait(dqp)) {
475 /* 492 /*
476 * If we can't grab the flush lock then check 493 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 502 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 503 * across a disk write.
487 */ 504 */
488 xfs_qm_mplist_unlock(mp); 505 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 506 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 507 xfs_dqunlock(dqp);
491 if (error) 508 if (error)
492 return error; 509 return error;
493 510
494 xfs_qm_mplist_lock(mp); 511 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 512 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 513 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 514 /* XXX restart limit */
498 goto again; 515 goto again;
499 } 516 }
500 } 517 }
501 518
502 xfs_qm_mplist_unlock(mp); 519 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 520 /* return ! busy */
504 return 0; 521 return 0;
505} 522}
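[Editor's note: every mplist walk in this file follows the same revalidation idiom: sample q->qi_dqreclaims under qi_dqlist_lock, drop the lock for the blocking flush, relock, and restart the walk if the counter moved, since that means entries were removed underneath the iterator. A minimal pthread sketch of that generation-counter pattern follows; the names are hypothetical.]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dqlist_lock = PTHREAD_MUTEX_INITIALIZER;
static int dqreclaims;  /* bumped whenever an entry leaves the list */
static int nentries = 3;

static void flush_entry(void) { /* blocking I/O happens here */ }

static int flush_all(void)
{
        int done;
again:
        pthread_mutex_lock(&dqlist_lock);
        for (done = 0; done < nentries; done++) {
                int recl = dqreclaims;  /* sample while still locked */

                /* Can't hold the list lock across a disk write. */
                pthread_mutex_unlock(&dqlist_lock);
                flush_entry();
                pthread_mutex_lock(&dqlist_lock);

                if (recl != dqreclaims) {
                        /* The list changed while we slept; the cursor
                         * may be stale, so restart from the top. */
                        pthread_mutex_unlock(&dqlist_lock);
                        goto again;
                }
        }
        pthread_mutex_unlock(&dqlist_lock);
        return 0;
}

int main(void)
{
        printf("flush_all -> %d\n", flush_all());
        return 0;
}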
@@ -509,15 +526,15 @@ again:
509 */ 526 */
510STATIC void 527STATIC void
511xfs_qm_detach_gdquots( 528xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 529 struct xfs_mount *mp)
513{ 530{
514 xfs_dquot_t *dqp, *gdqp; 531 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 532 struct xfs_dquot *dqp, *gdqp;
533 int nrecl;
516 534
517 again: 535 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 536 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 537 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 538 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 539 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 540 xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 547 * Can't hold the mplist lock across a dqput.
531 * XXXmust convert to marker based iterations here. 548 * XXXmust convert to marker based iterations here.
532 */ 549 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 550 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 551 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 552 xfs_qm_dqput(gdqp);
536 553
537 xfs_qm_mplist_lock(mp); 554 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 555 if (nrecl != q->qi_dqreclaims)
539 goto again; 556 goto again;
540 } 557 }
541 dqp = dqp->MPL_NEXT;
542 } 558 }
543} 559}
544 560
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
550 */ 566 */
551STATIC int 567STATIC int
552xfs_qm_dqpurge_int( 568xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 569 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 570 uint flags)
555{ 571{
556 xfs_dquot_t *dqp; 572 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 573 struct xfs_dquot *dqp, *n;
558 int nrecl; 574 uint dqtype;
559 xfs_dquot_t *nextdqp; 575 int nrecl;
560 int nmisses; 576 int nmisses;
561 577
562 if (mp->m_quotainfo == NULL) 578 if (!q)
563 return 0; 579 return 0;
564 580
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 581 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 582 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 583 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 584
569 xfs_qm_mplist_lock(mp); 585 mutex_lock(&q->qi_dqlist_lock);
570 586
571 /* 587 /*
572 * In the first pass through all incore dquots of this filesystem, 588 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
578 594
579 again: 595 again:
580 nmisses = 0; 596 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 597 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 598 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 599 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 600 * get them off mplist and hashlist, but leave them on freelist.
585 */ 601 */
586 dqp = XFS_QI_MPLNEXT(mp); 602 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 603 /*
589 * It's OK to look at the type without taking dqlock here. 604 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 605 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 606 * a dqreclaim.
592 */ 607 */
593 if ((dqp->dq_flags & dqtype) == 0) { 608 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 609 continue;
596 }
597 610
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 611 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 612 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 613 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 614 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 615 mutex_lock(&q->qi_dqlist_lock);
603 616
604 /* 617 /*
605 * XXXTheoretically, we can get into a very long 618 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 620 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 621 * this point, but somebody might be taking things off.
609 */ 622 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 623 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 624 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 625 goto again;
613 } 626 }
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 630 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 631 * freelist in INACTIVE state.
619 */ 632 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 633 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 634 }
624 xfs_qm_mplist_unlock(mp); 635 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 636 return nmisses;
626} 637}
627 638
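[Editor's note: dqpurge_int needs a per-dquot hash lock while already holding the mplist lock, which is the wrong acquisition order. It first tries mutex_trylock(); only on failure does it drop the list lock, take both locks in the documented order, and revalidate through qi_dqreclaims. A hedged pthread sketch of that trylock-to-avoid-inversion idiom, under simplified assumptions:]

#include <pthread.h>

static pthread_mutex_t hash_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dqlist_lock = PTHREAD_MUTEX_INITIALIZER;
static int dqreclaims;

/* Documented order is hash_lock before dqlist_lock, but the purge
 * loop arrives already holding dqlist_lock. */
static void purge_one(void)
{
again:
        pthread_mutex_lock(&dqlist_lock);
        if (pthread_mutex_trylock(&hash_lock) != 0) {
                int recl = dqreclaims;  /* sample before dropping out */

                /* Out-of-order attempt failed: back off and reacquire
                 * both locks in the documented order. */
                pthread_mutex_unlock(&dqlist_lock);
                pthread_mutex_lock(&hash_lock);
                pthread_mutex_lock(&dqlist_lock);

                if (recl != dqreclaims) {
                        /* Entries were removed while unlocked: restart. */
                        pthread_mutex_unlock(&dqlist_lock);
                        pthread_mutex_unlock(&hash_lock);
                        goto again;
                }
        }
        /* ... both locks held: unhash and unlist the entry ... */
        pthread_mutex_unlock(&dqlist_lock);
        pthread_mutex_unlock(&hash_lock);
}

int main(void) { purge_one(); return 0; }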
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
921 932
922int 933int
923xfs_qm_sync( 934xfs_qm_sync(
924 xfs_mount_t *mp, 935 struct xfs_mount *mp,
925 int flags) 936 int flags)
926{ 937{
927 int recl, restarts; 938 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 939 int recl, restarts;
929 int error; 940 struct xfs_dquot *dqp;
941 int error;
930 942
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 943 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 944 return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
934 restarts = 0; 946 restarts = 0;
935 947
936 again: 948 again:
937 xfs_qm_mplist_lock(mp); 949 mutex_lock(&q->qi_dqlist_lock);
938 /* 950 /*
939 * dqpurge_all() also takes the mplist lock and iterate thru all dquots 951 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 952 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 953 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 954 * as long as we have it locked.
943 */ 955 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 956 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 957 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 958 return 0;
947 } 959 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 960 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
961 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 962 /*
950 * If this is vfs_sync calling, then skip the dquots that 963 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. ie. don't acquire dqlock. 964 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
969 } 982 }
970 983
971 /* XXX a sentinel would be better */ 984 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 985 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 986 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 987 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 988 xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 1002 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 1003 * across a disk write
991 */ 1004 */
992 xfs_qm_mplist_unlock(mp); 1005 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1006 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1007 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1008 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
997 else if (error) 1010 else if (error)
998 return error; 1011 return error;
999 1012
1000 xfs_qm_mplist_lock(mp); 1013 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1014 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1015 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1016 break;
1004 1017
1005 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1019 goto again;
1007 } 1020 }
1008 } 1021 }
1009 1022
1010 xfs_qm_mplist_unlock(mp); 1023 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1024 return 0;
1012} 1025}
1013 1026
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1065 return error;
1053 } 1066 }
1054 1067
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1068 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1069 mutex_init(&qinf->qi_dqlist_lock);
1070 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1071
1058 qinf->qi_dqreclaims = 0; 1072 qinf->qi_dqreclaims = 0;
1059 1073
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1164 */
1151 xfs_qm_rele_quotafs_ref(mp); 1165 xfs_qm_rele_quotafs_ref(mp);
1152 1166
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1167 ASSERT(list_empty(&qi->qi_dqlist));
1168 mutex_destroy(&qi->qi_dqlist_lock);
1154 1169
1155 if (qi->qi_uquotaip) { 1170 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1171 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
1177 int n) 1192 int n)
1178{ 1193{
1179 mutex_init(&list->qh_lock); 1194 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1195 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1196 list->qh_version = 0;
1182 list->qh_nelems = 0; 1197 list->qh_nelems = 0;
1183} 1198}
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
1316 */ 1331 */
1317 spin_lock(&mp->m_sb_lock); 1332 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1333 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1334 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1335 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1336 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
1331 1343
1332 /* qflags will get updated _after_ quotacheck */ 1344 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1345 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1346 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1347 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1348 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1378#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1379 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1380 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1381 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1382#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1383 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1384 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1385 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1386 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1387 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1436 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1437 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1438 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1439 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1440 if (error)
1434 break; 1441 break;
1435 1442
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1446 * goto the next block.
1440 */ 1447 */
1441 bno++; 1448 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1449 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1450 }
1444 return error; 1451 return error;
1445} 1452}
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
1505 continue; 1512 continue;
1506 1513
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1514 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1515 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1516 /*
1510 * Do a read-ahead on the next extent. 1517 * Do a read-ahead on the next extent.
1511 */ 1518 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1523 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1524 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1525 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1526 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1527 rablkno++;
1521 } 1528 }
1522 } 1529 }
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1583
1577 /* 1584 /*
1578 * Set default limits, adjust timers (since we changed usages) 1585 * Set default limits, adjust timers (since we changed usages)
1586 *
1587 * There are no timers for the default values set in the root dquot.
1579 */ 1588 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1589 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1590 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1591 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1592 }
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1756 lastino = 0;
1748 flags = 0; 1757 flags = 0;
1749 1758
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1759 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1760 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1761
1753 /* 1762 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1763 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1764 * algorithm doesn't like that.
1756 */ 1765 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1766 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1767
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1768 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1769
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1772 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1773 * We don't log our changes till later.
1765 */ 1774 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1775 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1776 if (uip) {
1777 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1778 if (error)
1768 goto error_return; 1779 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1780 flags |= XFS_UQUOTA_CHKD;
1770 } 1781 }
1771 1782
1772 if ((gip = XFS_QI_GQIP(mp))) { 1783 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1784 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1785 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1786 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1787 if (error)
1775 goto error_return; 1788 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1789 flags |= XFS_OQUOTA_CHKD;
1777 } 1790 }
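[Editor's note: the quotacheck hunk above also unpicks "if ((error = xfs_qm_dqiterate(...)))" into a plain assignment followed by a separate test, the form kernel style prefers. A tiny standalone illustration, with a hypothetical helper:]

#include <stdio.h>

static int dqiterate(void) { return 0; }        /* 0 == success, kernel-style */

int main(void)
{
        int error;

        /* Old form folded the assignment into the condition:
         *      if ((error = dqiterate()))
         *              goto error_return;
         * The rewrite separates the two steps: */
        error = dqiterate();
        if (error)
                goto error_return;

        printf("quotacheck ok\n");
error_return:
        return error;
}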
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1817 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1818 */
1806 if (error) { 1819 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1820 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1821 goto error_return;
1809 } 1822 }
1810 1823
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1838 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1839 mp->m_qflags |= flags;
1827 1840
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1841 xfs_qm_dquot_list_print(mp);
1829 1842
1830 error_return: 1843 error_return:
1831 if (error) { 1844 if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
1920 } 1933 }
1921 } 1934 }
1922 1935
1923 XFS_QI_UQIP(mp) = uip; 1936 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1937 mp->m_quotainfo->qi_gquotaip = gip;
1925 1938
1926 return 0; 1939 return 0;
1927} 1940}
1928 1941
1929 1942
1943
1930/* 1944/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1945 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1946 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1947 */
1936STATIC int 1948STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1949xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1950{
1940 int nreclaimed; 1951 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1952 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1953 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1954
1949 nreclaimed = 0;
1950 restarts = 0; 1955 restarts = 0;
1951 nflushes = 0; 1956 dqpout = NULL;
1952 1957
1953#ifdef QUOTADEBUG 1958 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1959startagain:
1955#endif 1960 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1961
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1962 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1963 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1964 xfs_dqlock(dqp);
1964 1965
1965 /* 1966 /*
1966 * We are racing with dqlookup here. Naturally we don't 1967 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1968 * want to reclaim a dquot that lookup wants. We release the
1969 * freelist lock and start over, so that lookup will grab
1970 * both the dquot and the freelistlock.
1968 */ 1971 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1972 if (dqp->dq_flags & XFS_DQ_WANT) {
1973 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1974
1975 trace_xfs_dqreclaim_want(dqp);
1976
1970 xfs_dqunlock(dqp); 1977 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1978 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1979 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1980 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1981 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1982 goto startagain;
1976 } 1983 }
1977 1984
1978 /* 1985 /*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1988 * life easier.
1982 */ 1989 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1990 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1991 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1992 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1993 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1994 ASSERT(list_empty(&dqp->q_mplist));
1995 list_del_init(&dqp->q_freelist);
1996 xfs_Gqm->qm_dqfrlist_cnt--;
1997 xfs_dqunlock(dqp);
1998 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1999 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 2000 break;
1990 goto off_freelist;
1991 } 2001 }
1992 2002
1993 ASSERT(dqp->MPL_PREVP); 2003 ASSERT(dqp->q_hash);
2004 ASSERT(!list_empty(&dqp->q_mplist));
2005
1994 /* 2006 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2007 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2008 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2009 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2010 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2011 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2012 continue;
2002 } 2013 }
2003 2014
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2021 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2022 int error;
2012 2023
2013 trace_xfs_dqshake_dirty(dqp); 2024 trace_xfs_dqreclaim_dirty(dqp);
2014 2025
2015 /* 2026 /*
2016 * We flush it delayed write, so don't bother 2027 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2028 * releasing the freelist lock.
2018 */ 2029 */
2019 error = xfs_qm_dqflush(dqp, 0); 2030 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2031 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2032 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2033 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2034 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2035 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2036 continue;
2027 } 2037 }
2038
2028 /* 2039 /*
2029 * We're trying to get the hashlock out of order. This races 2040 * We're trying to get the hashlock out of order. This races
2030 * with dqlookup; so, we giveup and goto the next dquot if 2041 * with dqlookup; so, we giveup and goto the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2044 * waiting for the freelist lock.
2034 */ 2045 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2046 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2047 restarts++;
2037 xfs_dqunlock(dqp); 2048 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2049 }
2050
2041 /* 2051 /*
2042 * This races with dquot allocation code as well as dqflush_all 2052 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2053 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * giveup everything and start over. 2054 * giveup everything and start over.
2045 */ 2055 */
2046 hash = dqp->q_hash; 2056 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2057 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2058 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2059 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2060 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2061 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2062 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2063 return NULL;
2055 return nreclaimed; 2064 goto startagain;
2056 goto tryagain;
2057 } 2065 }
2058 2066
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2067 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2068 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2069 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2070 mp->m_quotainfo->qi_dqreclaims++;
2071 list_del_init(&dqp->q_hashlist);
2072 dqp->q_hash->qh_version++;
2073 list_del_init(&dqp->q_freelist);
2074 xfs_Gqm->qm_dqfrlist_cnt--;
2075 dqpout = dqp;
2076 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2077 mutex_unlock(&dqp->q_hash->qh_lock);
2078dqfunlock:
2069 xfs_dqfunlock(dqp); 2079 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2080 xfs_dqunlock(dqp);
2076 nreclaimed++; 2081 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2082 break;
2083 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2084 return NULL;
2085 }
2086 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2087 return dqpout;
2088}
2089
2090/*
2091 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2092 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2093 * favor the lookup function ...
2094 */
2095STATIC int
2096xfs_qm_shake_freelist(
2097 int howmany)
2098{
2099 int nreclaimed = 0;
2100 xfs_dquot_t *dqp;
2101
2102 if (howmany <= 0)
2103 return 0;
2104
2105 while (nreclaimed < howmany) {
2106 dqp = xfs_qm_dqreclaim_one();
2107 if (!dqp)
2108 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2109 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2110 nreclaimed++;
2080 } 2111 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2112 return nreclaimed;
2083} 2113}
2084 2114
2085
2086/* 2115/*
2087 * The kmem_shake interface is invoked when memory is running low. 2116 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2117 */
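[Editor's note: the reclaim rework leaves a single xfs_qm_dqreclaim_one() that pops one least-recently-used dquot off the freelist and shrinks xfs_qm_shake_freelist() to a counting loop around it; the block removed further down is the old duplicated version. A toy sketch of the helper-plus-loop split, where a bare singly-linked stack stands in for the locked freelist:]

#include <stdio.h>
#include <stdlib.h>

struct dquot { struct dquot *next; };
static struct dquot *freelist;

/* Pop one entry, or NULL if nothing is reclaimable. */
static struct dquot *dqreclaim_one(void)
{
        struct dquot *dqp = freelist;

        if (dqp)
                freelist = dqp->next;
        return dqp;
}

/* The shaker is reduced to a bounded loop around the helper. */
static int shake_freelist(int howmany)
{
        int nreclaimed = 0;

        while (nreclaimed < howmany) {
                struct dquot *dqp = dqreclaim_one();

                if (!dqp)
                        break;
                free(dqp);      /* stands in for xfs_qm_dqdestroy() */
                nreclaimed++;
        }
        return nreclaimed;
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct dquot *dqp = malloc(sizeof(*dqp));

                dqp->next = freelist;
                freelist = dqp;
        }
        printf("reclaimed %d\n", shake_freelist(8));
        return 0;
}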
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2126 if (!xfs_Gqm)
2098 return 0; 2127 return 0;
2099 2128
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2129 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2130 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2131 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2132
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2142}
2114 2143
2115 2144
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2145/*------------------------------------------------------------------*/
2242 2146
2243/* 2147/*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
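[Editor's note: the block deleted above is a hand-rolled circular list whose head is cast to a fake xfs_dquot_t so the sentinel can pose as a node. list.h makes the sentinel an explicit struct list_head, so insert and unlink collapse to list_add()/list_del_init() with no casts. A compilable sketch of the equivalent helpers, simplified from <linux/list.h>:]

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

/* What xfs_qm_freelist_unlink() open-coded: splice the node out,
 * then point it at itself so list_empty(node) is true afterwards. */
static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
        struct list_head head, node;

        INIT_LIST_HEAD(&head);
        list_add(&node, &head);
        assert(!list_empty(&head));
        list_del_init(&node);
        assert(list_empty(&head) && list_empty(&node));
        return 0;
}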
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d0ee8d492db..92b002f1805f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
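[Editor's note: xfs_qm_scall_quotaoff() used to release qi_quotaofflock at several scattered returns, one of them through the misleadingly named out_error label; the rewrite funnels every exit, success or failure, through a single out_unlock label. A minimal sketch of that single-exit locking shape, with a pthread mutex as a stand-in:]

#include <pthread.h>

static pthread_mutex_t qi_quotaofflock = PTHREAD_MUTEX_INITIALIZER;

static int quotaoff(int flags)
{
        int error = 0;

        pthread_mutex_lock(&qi_quotaofflock);

        if (flags == 0)                 /* nothing to do: still one exit */
                goto out_unlock;
        if (flags < 0) {                /* a failure path */
                error = -1;
                goto out_unlock;
        }
        /* ... the actual quota-off work ... */

out_unlock:
        pthread_mutex_unlock(&qi_quotaofflock); /* the only unlock site */
        return error;
}

int main(void)
{
        return quotaoff(1);
}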
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,17 +437,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
451#define XFS_DQ_MASK \
452 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
453
455/* 454/*
456 * Adjust quota limits, and start/stop timers accordingly. 455 * Adjust quota limits, and start/stop timers accordingly.
457 */ 456 */
@@ -462,15 +461,17 @@ xfs_qm_scall_setqlim(
462 uint type, 461 uint type,
463 fs_disk_quota_t *newlim) 462 fs_disk_quota_t *newlim)
464{ 463{
464 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 465 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 466 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 467 xfs_trans_t *tp;
468 int error; 468 int error;
469 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
470 470
471 if ((newlim->d_fieldmask & 471 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 return EINVAL;
473 return (0); 473 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
474 return 0;
474 475
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 476 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 477 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
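[Editor's note: the new XFS_DQ_MASK check above splits what the old single test conflated: a fieldmask carrying bits setqlim doesn't understand is rejected with EINVAL (positive errno, XFS convention), while a mask carrying none of the known bits remains a successful no-op. A standalone sketch of that validation; the mask values here are made up, not the real fs_quota header constants:]

#include <errno.h>
#include <stdio.h>

#define FS_DQ_LIMIT_MASK        0x1u
#define FS_DQ_TIMER_MASK        0x2u
#define FS_DQ_WARNS_MASK        0x4u
#define XFS_DQ_MASK \
        (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)

static int setqlim(unsigned int fieldmask)
{
        if (fieldmask & ~XFS_DQ_MASK)
                return EINVAL;  /* bits we don't understand: refuse */
        if ((fieldmask & XFS_DQ_MASK) == 0)
                return 0;       /* nothing requested: succeed as a no-op */
        /* ... apply the requested limit/timer/warning changes ... */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               setqlim(0x80),                   /* unknown bit -> EINVAL */
               setqlim(0),                      /* empty mask  -> 0 */
               setqlim(FS_DQ_TIMER_MASK));      /* valid bit   -> 0 */
        return 0;
}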
@@ -485,7 +486,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 486 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 487 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 488 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 489 mutex_lock(&q->qi_quotaofflock);
489 490
490 /* 491 /*
491 * Get the dquot (locked), and join it to the transaction. 492 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +494,8 @@ xfs_qm_scall_setqlim(
493 */ 494 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 495 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 496 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
498 return (error); 498 goto out_unlock;
499 } 499 }
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
@@ -513,8 +513,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 513 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 514 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 515 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 516 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 517 q->qi_bsoftlimit = soft;
518 } 518 }
519 } else { 519 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +529,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 529 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 530 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 531 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 532 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 533 q->qi_rtbsoftlimit = soft;
534 } 534 }
535 } else { 535 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +546,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 546 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 547 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 548 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 549 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 550 q->qi_isoftlimit = soft;
551 } 551 }
552 } else { 552 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +572,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 572 * for warnings.
573 */ 573 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 574 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 575 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 577 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 578 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 579 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 581 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 583 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 585 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 586 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 587 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 588 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 589 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 591 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 592 } else {
593 /* 593 /*
594 * If the user is now over quota, start the timelimit. 594 * If the user is now over quota, start the timelimit.
@@ -605,8 +605,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 608
609 out_unlock:
610 mutex_unlock(&q->qi_quotaofflock);
610 return error; 611 return error;
611} 612}
612 613
@@ -853,7 +854,8 @@ xfs_dqrele_inode(
853 int error; 854 int error;
854 855
855 /* skip quota inodes */ 856 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 857 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
858 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 859 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 860 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
@@ -891,7 +893,8 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 893 uint flags)
892{ 894{
893 ASSERT(mp->m_quotainfo); 895 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); 896 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
897 XFS_ICI_NO_TAG, 0, NULL);
895} 898}
896 899
897/*------------------------------------------------------------------------*/ 900/*------------------------------------------------------------------------*/
@@ -930,7 +933,8 @@ struct mutex qcheck_lock;
930} 933}
931 934
932typedef struct dqtest { 935typedef struct dqtest {
933 xfs_dqmarker_t q_lists; 936 uint dq_flags; /* various flags (XFS_DQ_*) */
937 struct list_head q_hashlist;
934 xfs_dqhash_t *q_hash; /* the hashchain header */ 938 xfs_dqhash_t *q_hash; /* the hashchain header */
935 xfs_mount_t *q_mount; /* filesystem this relates to */ 939 xfs_mount_t *q_mount; /* filesystem this relates to */
936 xfs_dqid_t d_id; /* user id or group id */ 940 xfs_dqid_t d_id; /* user id or group id */
@@ -941,14 +945,9 @@ typedef struct dqtest {
941STATIC void 945STATIC void
942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 946xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
943{ 947{
944 xfs_dquot_t *d; 948 list_add(&dqp->q_hashlist, &h->qh_list);
945 if (((d) = (h)->qh_next)) 949 h->qh_version++;
946 (d)->HL_PREVP = &((dqp)->HL_NEXT); 950 h->qh_nelems++;
947 (dqp)->HL_NEXT = d;
948 (dqp)->HL_PREVP = &((h)->qh_next);
949 (h)->qh_next = (xfs_dquot_t *)dqp;
950 (h)->qh_version++;
951 (h)->qh_nelems++;
952} 951}
953STATIC void 952STATIC void
954xfs_qm_dqtest_print( 953xfs_qm_dqtest_print(
@@ -1060,9 +1059,7 @@ xfs_qm_internalqcheck_dqget(
1060 xfs_dqhash_t *h; 1059 xfs_dqhash_t *h;
1061 1060
1062 h = DQTEST_HASH(mp, id, type); 1061 h = DQTEST_HASH(mp, id, type);
1063 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1062 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1064 d = (xfs_dqtest_t *) d->HL_NEXT) {
1065 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1066 if (d->d_id == id && mp == d->q_mount) { 1063 if (d->d_id == id && mp == d->q_mount) {
1067 *O_dq = d; 1064 *O_dq = d;
1068 return (0); 1065 return (0);
@@ -1073,6 +1070,7 @@ xfs_qm_internalqcheck_dqget(
1073 d->d_id = id; 1070 d->d_id = id;
1074 d->q_mount = mp; 1071 d->q_mount = mp;
1075 d->q_hash = h; 1072 d->q_hash = h;
1073 INIT_LIST_HEAD(&d->q_hashlist);
1076 xfs_qm_hashinsert(h, d); 1074 xfs_qm_hashinsert(h, d);
1077 *O_dq = d; 1075 *O_dq = d;
1078 return (0); 1076 return (0);
@@ -1179,8 +1177,6 @@ xfs_qm_internalqcheck(
1179 xfs_ino_t lastino; 1177 xfs_ino_t lastino;
1180 int done, count; 1178 int done, count;
1181 int i; 1179 int i;
1182 xfs_dqtest_t *d, *e;
1183 xfs_dqhash_t *h1;
1184 int error; 1180 int error;
1185 1181
1186 lastino = 0; 1182 lastino = 0;
@@ -1220,19 +1216,18 @@ xfs_qm_internalqcheck(
1220 } 1216 }
1221 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1217 cmn_err(CE_DEBUG, "Checking results against system dquots");
1222 for (i = 0; i < qmtest_hashmask; i++) { 1218 for (i = 0; i < qmtest_hashmask; i++) {
1223 h1 = &qmtest_udqtab[i]; 1219 xfs_dqtest_t *d, *n;
1224 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1220 xfs_dqhash_t *h;
1221
1222 h = &qmtest_udqtab[i];
1223 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1225 xfs_dqtest_cmp(d); 1224 xfs_dqtest_cmp(d);
1226 e = (xfs_dqtest_t *) d->HL_NEXT;
1227 kmem_free(d); 1225 kmem_free(d);
1228 d = e;
1229 } 1226 }
1230 h1 = &qmtest_gdqtab[i]; 1227 h = &qmtest_gdqtab[i];
1231 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1228 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1232 xfs_dqtest_cmp(d); 1229 xfs_dqtest_cmp(d);
1233 e = (xfs_dqtest_t *) d->HL_NEXT;
1234 kmem_free(d); 1230 kmem_free(d);
1235 d = e;
1236 } 1231 }
1237 } 1232 }
1238 1233
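Editor's note: the qcheck teardown above is the visible payoff of switching the dqtest hash chains from the open-coded HL_PREVP/HL_NEXT pointers to the generic list_head API — insertion becomes a one-line list_add(), and the free loop becomes list_for_each_entry_safe(), which caches the next pointer so the current entry can be freed mid-walk. A minimal userspace sketch of the same pattern follows; the list primitives are stripped-down stand-ins for the kernel's <linux/list.h>, and the struct is an illustrative reduction of xfs_dqtest_t, not the kernel definition.

#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel's <linux/list.h> primitives. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Illustrative reduction of xfs_dqtest_t: linkage is an embedded list_head. */
struct dqtest {
	unsigned int d_id;
	struct list_head q_hashlist;
};

int main(void)
{
	struct list_head hash = { &hash, &hash };	/* empty bucket */
	struct list_head *pos, *n;
	unsigned int id;

	for (id = 0; id < 4; id++) {
		struct dqtest *d = calloc(1, sizeof(*d));
		d->d_id = id;
		INIT_LIST_HEAD(&d->q_hashlist);
		list_add(&d->q_hashlist, &hash);	/* xfs_qm_hashinsert() */
	}

	/* Expansion of list_for_each_entry_safe(): safe to free the cursor. */
	for (pos = hash.next; pos != &hash; pos = n) {
		struct dqtest *d = container_of(pos, struct dqtest, q_hashlist);
		n = pos->next;
		printf("freeing dqtest id %u\n", d->d_id);
		free(d);
	}
	return 0;
}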
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
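Editor's note: the xfs_trans_get_dqtrx() hunk above hoists the user-vs-group array choice out of the loop, leaving a plain linear scan for an existing entry or the first free slot. A standalone sketch of that lookup shape; TRANS_MAXDQS and the struct fields are illustrative stand-ins, not the kernel's definitions.

#include <stddef.h>

#define TRANS_MAXDQS	2		/* mirrors XFS_QM_TRANS_MAXDQS */

struct dquot;				/* opaque for this sketch */

struct dqtrx {
	struct dquot	*qt_dquot;	/* NULL means the slot is free */
	long		qt_blk_res;
};

struct dqinfo {
	struct dqtrx	dqa_usrdquots[TRANS_MAXDQS];
	struct dqtrx	dqa_grpdquots[TRANS_MAXDQS];
};

/*
 * Return the accounting slot for @dqp: an existing match or the first
 * free slot, or NULL if every slot is taken by other dquots.
 */
static struct dqtrx *
get_dqtrx(struct dqinfo *info, struct dquot *dqp, int is_user)
{
	struct dqtrx *qa = is_user ? info->dqa_usrdquots
				   : info->dqa_grpdquots;
	int i;

	for (i = 0; i < TRANS_MAXDQS; i++) {
		if (qa[i].qt_dquot == NULL || qa[i].qt_dquot == dqp)
			return &qa[i];
	}
	return NULL;
}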
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1ff88ea..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,16 +222,17 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
223 int pag_ici_init; /* incore inode cache initialised */ 230 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 231 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 232 struct radix_tree_root pag_ici_root; /* incore inode cache root */
233 int pag_ici_reclaimable; /* reclaimable inodes */
226#endif 234#endif
227 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
229} xfs_perag_t; 236} xfs_perag_t;
230 237
231/* 238/*
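Editor's note: the new xfs_busy_extent sits on two containers at once — the rb_node threads it into the per-AG pagb_tree keyed by bno, while the list_head threads the same object onto the owning transaction's t_busy list so commit completion can find and clear it. A sketch of that dual membership; the list/rbtree types are trimmed stand-ins for the kernel's, and both lookups reduce to container_of().

#include <stddef.h>

struct rb_node   { struct rb_node *rb_left, *rb_right; };	/* trimmed */
struct list_head { struct list_head *next, *prev; };		/* trimmed */

struct busy_extent {
	struct rb_node	rb_node;	/* per-AG tree, keyed by bno */
	struct list_head list;		/* owning transaction's busy list */
	unsigned int	agno;
	unsigned int	bno;
	unsigned int	length;
	unsigned int	tid;		/* transaction that freed the extent */
};

/* rb_entry()/list_entry() are both container_of(): offset subtraction. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct busy_extent *from_tree(struct rb_node *n)
{
	return container_of(n, struct busy_extent, rb_node);
}

static struct busy_extent *from_trans(struct list_head *l)
{
	return container_of(l, struct busy_extent, list);
}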
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * There can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID to identify the transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different; i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, and then used to mark
2568 * the same extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to ensure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to ensure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial and an exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
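Editor's note: the descent in xfs_alloc_busy_search() above is an ordinary by-key rbtree walk with one twist — overlaps that miss the exact start block are noted as -1 and the walk continues toward the key. A compiling userspace model of the same decision structure follows; the node layout is trimmed to what the walk needs, and the kernel version uses <linux/rbtree.h> with pagb_lock held around the walk.

#include <stddef.h>

struct rb_node { struct rb_node *rb_left, *rb_right; };	/* trimmed */

#define rb_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct busy_extent {
	struct rb_node	rb_node;
	unsigned int	bno;		/* start block: the tree key */
	unsigned int	length;
};

/*
 * Mirrors the return contract of xfs_alloc_busy_search(): 0 for no
 * overlap, -1 for an overlapping but inexact busy extent, 1 for an
 * exact (bno, length) match.
 */
static int busy_search(struct rb_node *root, unsigned int bno,
		       unsigned int len)
{
	struct rb_node *rbp = root;
	int match = 0;

	while (rbp) {
		struct busy_extent *busyp =
			rb_entry(rbp, struct busy_extent, rb_node);

		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches, length decides exact vs partial */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	return match;
}

Callers in the hunks above only care that the result is nonzero: any overlap, exact or partial, maps to xfs_trans_set_sync() on the allocating transaction.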
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not: 334/* You would think we need to bump the nvecs here too, but we do not:
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -372,12 +392,12 @@ xfs_buf_item_pin(
372 */ 392 */
373STATIC void 393STATIC void
374xfs_buf_item_unpin( 394xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 395 xfs_buf_log_item_t *bip)
376 int stale)
377{ 396{
378 struct xfs_ail *ailp; 397 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 398 xfs_buf_t *bp;
380 int freed; 399 int freed;
400 int stale = bip->bli_flags & XFS_BLI_STALE;
381 401
382 bp = bip->bli_buf; 402 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 403 ASSERT(bp != NULL);
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 448 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 449 xfs_trans_t *tp)
430{ 450{
431 xfs_buf_t *bp; 451 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 452 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 453 (bip->bli_flags & XFS_BLI_STALE)) {
454 /*
455 * yes -- We can safely do some work here and then call
456 * buf_item_unpin to do the rest because we are
457 * are holding the buffer locked so no one else will be
458 * able to bump up the refcount. We have to remove the
459 * log item from the transaction as we are about to release
460 * our reference to the buffer. If we don't, the unlock that
461 * occurs later in the xfs_trans_uncommit() will try to
462 * reference the buffer which we no longer have a hold on.
463 */
464 struct xfs_log_item_desc *lidp;
465
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 466 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 467 trace_xfs_buf_item_unpin_stale(bip);
443 468
444 /* 469 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 470 xfs_trans_free_item(tp, lidp);
471
455 /* 472 /*
456 * Since the transaction no longer refers to the buffer, 473 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 474 * buffer should no longer refer to the transaction.
458 */ 475 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 476 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 477 }
461 478 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 479}
466 480
467/* 481/*
@@ -495,20 +509,23 @@ xfs_buf_item_trylock(
495} 509}
496 510
497/* 511/*
498 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 515 *
502 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
503 * 518 *
504 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
507 * 524 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
512 */ 529 */
513STATIC void 530STATIC void
514xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -520,73 +537,54 @@ xfs_buf_item_unlock(
520 537
521 bp = bip->bli_buf; 538 bp = bip->bli_buf;
522 539
523 /* 540 /* Clear the buffer's association with this transaction. */
524 * Clear the buffer's association with this transaction.
525 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
527 542
528 /* 543 /*
529 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
530 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
531 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
532 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
533 * cycle if we abort inside commit.
534 */ 548 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
536 550
537 /* 551 /*
538 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
539 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
540 * buffer is unpinned for the last time.
541 */ 554 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
543 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
544 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
546 if (!aborted)
547 return;
548 }
549 559
550 /* 560 /*
551 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
552 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
553 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
554 * the transaction is really through with the buffer.
555 */ 564 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
557 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
558 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
559 /* 568 if (!aborted) {
560 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
561 * transaction state. 570 return;
562 */ 571 }
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 } 572 }
565 573
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
572 575
573 /* 576 /*
574 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
576 */ 579 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
580 } else if (hold) { 583 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
582 }
583 585
584 /* 586 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
589 }
590} 588}
591 589
592/* 590/*
@@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 673 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 674 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 675 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 676 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 677 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 678 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 679 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -723,20 +721,17 @@ xfs_buf_item_init(
723 } 721 }
724 722
725 /* 723 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
730 */ 728 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 731
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 733 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 734 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 735 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -799,8 +794,8 @@ xfs_buf_item_log(
799 /* 794 /*
800 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
801 */ 796 */
802 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
804 799
805 /* 800 /*
806 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
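Editor's note: the bli lifecycle change running through the hunks above is plain reference counting — ->iop_pin now takes an extra reference on the log item, so transaction completion can drop its reference unconditionally without racing the unpin path freeing the bli. An illustrative reduction of that rule, with hypothetical names standing in for the real atomic_t handling:

#include <assert.h>
#include <stdlib.h>

/* Illustrative reduction of xfs_buf_log_item_t reference counting. */
struct bli {
	int	refcount;	/* stands in for atomic_t bli_refcount */
};

/* Drop one reference; free the item when the last one goes away. */
static void bli_put(struct bli *bip)
{
	assert(bip->refcount > 0);
	if (--bip->refcount == 0)
		free(bip);	/* xfs_buf_item_relse() territory */
}

/* ->iop_pin: always take a bli reference while the item is pinned. */
static void bli_pin(struct bli *bip)
{
	bip->refcount++;
}

/* ->iop_unpin: drop the reference the pin took. */
static void bli_unpin(struct bli *bip)
{
	bli_put(bip);
}

/*
 * Transaction completion (xfs_buf_item_unlock) can now drop its
 * reference unconditionally: if the item was logged and pinned, the
 * pin's reference keeps the bli alive until bli_unpin() runs, so
 * unlock never touches the bli after its own reference is gone.
 */
static void bli_unlock(struct bli *bip)
{
	bli_put(bip);
}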
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
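Editor's note: the rename draws a line between two flag namespaces — XFS_BLF_* live in the on-disk log format (blf_flags, seen by log recovery), while XFS_BLI_* are in-memory item state (bli_flags). The new XFS_BLI_INODE_BUF is in-memory only and is translated into its format counterpart at ->iop_format time, as in the xfs_buf_item.c hunk earlier. A sketch of that translation step; struct buf_log_item here is a reduction, and in_current_chkpt() is a stand-in for xfs_log_item_in_current_chkpt().

/* On-disk log format flags (subset; match the header above). */
#define XFS_BLF_INODE_BUF		0x1

/* In-memory buf log item flags (subset). */
#define XFS_BLI_INODE_ALLOC_BUF		0x10
#define XFS_BLI_INODE_BUF		0x40

struct buf_log_item {
	unsigned int	bli_flags;	/* in-memory state */
	unsigned short	blf_flags;	/* goes into the log */
};

/* Illustrative stand-in for xfs_log_item_in_current_chkpt(). */
static int in_current_chkpt(struct buf_log_item *bip)
{
	(void)bip;
	return 0;
}

static void transfer_inode_buf_flag(struct buf_log_item *bip)
{
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      in_current_chkpt(bip)))
			bip->blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}
}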
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index cd27c9d6c71f..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -177,16 +177,26 @@ xfs_swap_extents_check_format(
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL; 178 return EINVAL;
179 179
180 /* Check root block of temp in btree form to max in target */ 180 /*
181 * If we are in a btree format, check that the temp root block will fit
182 * in the target and that it has enough extents to be in btree format
183 * in the target.
184 *
185 * Note that we have to be careful to allow btree->extent conversions
186 * (a common defrag case) which will occur when the temp inode is in
187 * extent format...
188 */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 189 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) && 190 ((XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 191 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
192 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
184 return EINVAL; 193 return EINVAL;
185 194
186 /* Check root block of target in btree form to max in temp */ 195 /* Reciprocal target->temp btree format checks */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) && 197 ((XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
199 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
190 return EINVAL; 200 return EINVAL;
191 201
192 return 0; 202 return 0;
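Editor's note: the reworked check above allows the common defrag result — a btree-format inode swapping with an extent-format temporary — while rejecting swaps where the btree root would not fit in the target fork, or where the extent count could not remain in btree form there. A standalone predicate with the same shape; the field names mirror the hunk conceptually but the fork-offset/extent-max plumbing is simplified and hypothetical.

#include <stdbool.h>

#define FMT_BTREE	1

struct swap_inode {
	int	format;		/* data fork format */
	int	broot_bytes;	/* btree root size, if btree format */
	int	fork_boff;	/* data fork offset of this inode */
	int	nextents;	/* extent count in the data fork */
	int	ext_max;	/* max extents that fit inline in the fork */
};

/* Returns true when a btree-format @src cannot be swapped into @dst. */
static bool btree_swap_invalid(const struct swap_inode *src,
			       const struct swap_inode *dst)
{
	if (src->format != FMT_BTREE)
		return false;	/* extent format always fits */
	/* root block must fit below the target's fork offset ... */
	if (dst->fork_boff && src->broot_bytes > dst->fork_boff)
		return true;
	/* ... and src must still have too many extents for extent form */
	if (src->nextents <= dst->ext_max)
		return true;
	return false;
}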
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
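The constification above leaves the wrapper pattern untouched: XFS_ERROR_REPORT captures __FILE__, __LINE__ and the return address at the call site and forwards them to the reporting function. A small standalone sketch of that pattern, assuming a GCC/Clang toolchain for __builtin_return_address; error_report here is an illustrative stand-in, not the kernel symbol:

#include <stdio.h>

/* stand-in for xfs_error_report(); the kernel version also takes a
 * level and a struct xfs_mount pointer */
static void error_report(const char *tag, const char *filename, int linenum,
                         void *ra)
{
        fprintf(stderr, "internal error %s at line %d of file %s. Caller %p\n",
                tag, linenum, filename, ra);
}

/* capture call-site information, as XFS_ERROR_REPORT does */
#define ERROR_REPORT(tag) \
        error_report((tag), __FILE__, __LINE__, __builtin_return_address(0))

int main(void)
{
        ERROR_REPORT("example-tag");
        return 0;
}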
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
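Both EFI and EFD setup now funnel their boilerplate through xfs_log_item_init(), whose body appears in the xfs_log.c hunk further down. A standalone sketch of the same consolidation, with trimmed stand-in types in place of the xfs_mount/xfs_ail/xfs_log_item structures:

#include <stdio.h>

struct ail { int dummy; };
struct item_ops { const char *name; };
struct mount { struct ail *m_ail; };

struct log_item {
        struct mount    *li_mountp;
        struct ail      *li_ailp;
        int              li_type;
        struct item_ops *li_ops;
};

enum { LI_EFI = 1, LI_EFD = 2 };

/* one helper replaces the four open-coded assignments in every *_init() */
static void log_item_init(struct mount *mp, struct log_item *item,
                          int type, struct item_ops *ops)
{
        item->li_mountp = mp;
        item->li_ailp   = mp->m_ail;
        item->li_type   = type;
        item->li_ops    = ops;
}

int main(void)
{
        struct ail ail;
        struct mount m = { &ail };
        struct item_ops efi_ops = { "efi" };
        struct log_item efi;

        log_item_init(&m, &efi, LI_EFI, &efi_ops);
        printf("type=%d ops=%s\n", efi.li_type, efi.li_ops->name);
        return 0;
}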
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
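With xfs_iomap_t gone, callers of xfs_iomap() receive a raw struct xfs_bmbt_irec plus a separate "new" flag and must classify the mapping themselves, the job the removed xfs_imap_to_bmap() did with IOMAP_HOLE/IOMAP_DELAY. A standalone sketch of that classification; the sentinel values and struct fields below are simplified stand-ins for the XFS definitions:

#include <stdio.h>

#define HOLESTARTBLOCK          (-2LL)
#define DELAYSTARTBLOCK         (-1LL)

struct bmbt_irec {
        long long br_startoff;          /* file offset, in fs blocks */
        long long br_startblock;        /* fs block, or a sentinel above */
        long long br_blockcount;        /* length, in fs blocks */
};

static const char *classify(const struct bmbt_irec *imap)
{
        if (imap->br_startblock == HOLESTARTBLOCK)
                return "hole";
        if (imap->br_startblock == DELAYSTARTBLOCK)
                return "delalloc";
        return "real extent";
}

int main(void)
{
        struct bmbt_irec maps[] = {
                { 0, HOLESTARTBLOCK, 16 },
                { 16, DELAYSTARTBLOCK, 8 },
                { 24, 1024, 8 },
        };
        for (int i = 0; i < 3; i++)
                printf("off %lld: %s\n", maps[i].br_startoff,
                       classify(&maps[i]));
        return 0;
}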
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 57
68/* local state machine functions */ 58/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
93STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
94 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
95 85
96
97/* local ticket functions */
98STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
99 int unit_bytes,
100 int count,
101 char clientid,
102 uint flags);
103
104#if defined(DEBUG) 86#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 89STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 90 int count, boolean_t syncing);
@@ -258,7 +240,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 240 * If we get an error, just continue and give back the log ticket.
259 */ 241 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 242 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 243 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 244 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 245 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 246 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -367,6 +349,15 @@ xfs_log_reserve(
367 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
368 internal_ticket = *ticket; 350 internal_ticket = *ticket;
369 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
370 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
371 362
372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +365,8 @@ xfs_log_reserve(
374 } else { 365 } else {
375 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
376 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
377 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
378 if (!internal_ticket) 370 if (!internal_ticket)
379 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
380 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
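A tiny standalone model of the TID bump above: a permanent ticket is reused across a chain of rolling transactions, and incrementing t_tid on each re-reservation keeps the chain members distinguishable in the log. Types and field names are stand-ins:

#include <stdio.h>
#include <stdint.h>

struct ticket {
        uint32_t t_tid;         /* transaction ID written to the log */
        int      t_perm;        /* permanent reservation, reused by rolls */
};

/* on re-reservation of a permanent ticket, bump the TID so every
 * transaction in the rolling chain gets a distinct ID in the log */
static void log_reserve(struct ticket *tic)
{
        if (tic->t_perm)
                tic->t_tid++;
}

int main(void)
{
        struct ticket tic = { 42, 1 };
        for (int i = 0; i < 3; i++) {
                log_reserve(&tic);
                printf("rolled transaction, tid=%u\n", tic.t_tid);
        }
        return 0;
}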
@@ -459,6 +451,13 @@ xfs_log_mount(
459 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
460 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
461 453
454 /*
455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
462 return 0; 461 return 0;
463 462
464out_destroy_ail: 463out_destroy_ail:
@@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 515#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 516 xlog_in_core_t *first_iclog;
518#endif 517#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 518 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 519 xfs_lsn_t lsn;
522 int error; 520 int error;
523 521
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 522 /*
532 * Don't write out unmount record on read-only mounts. 523 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 524 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 540 } while (iclog != first_iclog);
550#endif 541#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 542 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 543 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 544 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 545 if (!error) {
546 /* the data section must be 32 bit size aligned */
547 struct {
548 __uint16_t magic;
549 __uint16_t pad1;
550 __uint32_t pad2; /* may as well make it 64 bits */
551 } magic = {
552 .magic = XLOG_UNMOUNT_TYPE,
553 };
554 struct xfs_log_iovec reg = {
555 .i_addr = (void *)&magic,
556 .i_len = sizeof(magic),
557 .i_type = XLOG_REG_TYPE_UNMOUNT,
558 };
559 struct xfs_log_vec vec = {
560 .lv_niovecs = 1,
561 .lv_iovecp = &reg,
562 };
563
559 /* remove inited flag */ 564 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 565 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 566 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 567 NULL, XLOG_UNMOUNT_TRANS);
563 /* 568 /*
564 * At this point, we're umounting anyway, 569 * At this point, we're umounting anyway,
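The on-stack unmount record above is padded so its size stays 32-bit aligned, per the comment carried over from the old code. A standalone check of that layout; the magic value is a stand-in for XLOG_UNMOUNT_TYPE:

#include <stdint.h>
#include <stdio.h>

#define UNMOUNT_TYPE 0x556e             /* assumed stand-in magic */

struct unmount_rec {
        uint16_t magic;
        uint16_t pad1;
        uint32_t pad2;                  /* may as well make it 64 bits */
};

/* the data section must be 32 bit size aligned */
_Static_assert(sizeof(struct unmount_rec) % sizeof(uint32_t) == 0,
               "unmount record not 32-bit size aligned");

int main(void)
{
        struct unmount_rec rec = { .magic = UNMOUNT_TYPE };
        printf("magic=0x%x size=%zu\n", rec.magic, sizeof(rec));
        return 0;
}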
@@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 653 xlog_dealloc_log(mp->m_log);
649} 654}
650 655
656void
657xfs_log_item_init(
658 struct xfs_mount *mp,
659 struct xfs_log_item *item,
660 int type,
661 struct xfs_item_ops *ops)
662{
663 item->li_mountp = mp;
664 item->li_ailp = mp->m_ail;
665 item->li_type = type;
666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
671}
672
651/* 673/*
652 * Write region vectors to log. The write happens using the space reservation 674 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 675 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 676 * transaction occur with one call to xfs_log_write(). However, it is important
677 * to note that the transaction reservation code makes an assumption about the
678 * number of log headers a transaction requires that may be violated if you
679 * don't pass all the transaction vectors in one call....
655 */ 680 */
656int 681int
657xfs_log_write( 682xfs_log_write(
@@ -663,11 +688,15 @@ xfs_log_write(
663{ 688{
664 struct log *log = mp->m_log; 689 struct log *log = mp->m_log;
665 int error; 690 int error;
691 struct xfs_log_vec vec = {
692 .lv_niovecs = nentries,
693 .lv_iovecp = reg,
694 };
666 695
667 if (XLOG_FORCED_SHUTDOWN(log)) 696 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 697 return XFS_ERROR(EIO);
669 698
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 699 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 700 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 701 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 702 return error;
@@ -745,9 +774,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
745 774
746/* 775/*
747 * Determine if we have a transaction that has gone to disk 776 * Determine if we have a transaction that has gone to disk
748 * that needs to be covered. Log activity needs to be idle (no AIL and 777 * that needs to be covered. To begin the transition to the idle state
749 * nothing in the iclogs). And, we need to be in the right state indicating 778 * firstly the log needs to be idle (no AIL and nothing in the iclogs).
750 * something has gone out. 779 * If we are then in a state where covering is needed, the caller is informed
780 * that dummy transactions are required to move the log into the idle state.
781 *
782 * Because this is called as part of the sync process, we should also indicate
783 * that dummy transactions should be issued in anything but the covered or
784 * idle states. This ensures that the log tail is accurately reflected in
785 * the log at the end of the sync, hence if a crash occurs this avoids replay
786 * of transactions where the metadata is already on disk.
751 */ 787 */
752int 788int
753xfs_log_need_covered(xfs_mount_t *mp) 789xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +795,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
759 return 0; 795 return 0;
760 796
761 spin_lock(&log->l_icloglock); 797 spin_lock(&log->l_icloglock);
762 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 798 switch (log->l_covered_state) {
763 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 799 case XLOG_STATE_COVER_DONE:
764 && !xfs_trans_ail_tail(log->l_ailp) 800 case XLOG_STATE_COVER_DONE2:
765 && xlog_iclogs_empty(log)) { 801 case XLOG_STATE_COVER_IDLE:
766 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 802 break;
767 log->l_covered_state = XLOG_STATE_COVER_DONE; 803 case XLOG_STATE_COVER_NEED:
768 else { 804 case XLOG_STATE_COVER_NEED2:
769 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); 805 if (!xfs_trans_ail_tail(log->l_ailp) &&
770 log->l_covered_state = XLOG_STATE_COVER_DONE2; 806 xlog_iclogs_empty(log)) {
807 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
808 log->l_covered_state = XLOG_STATE_COVER_DONE;
809 else
810 log->l_covered_state = XLOG_STATE_COVER_DONE2;
771 } 811 }
812 /* FALLTHRU */
813 default:
772 needed = 1; 814 needed = 1;
815 break;
773 } 816 }
774 spin_unlock(&log->l_icloglock); 817 spin_unlock(&log->l_icloglock);
775 return needed; 818 return needed;
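A standalone restatement of the reworked decision above: the DONE/DONE2/IDLE states suppress dummy transactions, NEED/NEED2 advance when the log is idle and then fall through, and any other state also reports that covering work remains. Enum names and values are local stand-ins:

#include <stdio.h>

enum cover_state {
        COVER_IDLE, COVER_NEED, COVER_DONE, COVER_NEED2, COVER_DONE2,
};

/* log_is_idle stands in for "AIL empty and iclogs empty" */
static int need_covered(enum cover_state *state, int log_is_idle)
{
        int needed = 0;

        switch (*state) {
        case COVER_DONE:
        case COVER_DONE2:
        case COVER_IDLE:
                break;
        case COVER_NEED:
        case COVER_NEED2:
                if (log_is_idle)
                        *state = (*state == COVER_NEED) ? COVER_DONE
                                                        : COVER_DONE2;
                /* fall through: a dummy transaction is still needed */
        default:
                needed = 1;
                break;
        }
        return needed;
}

int main(void)
{
        enum cover_state s = COVER_NEED;

        /* first call advances NEED -> DONE, second call is a no-op */
        printf("needed=%d state=%d\n", need_covered(&s, 1), s);
        printf("needed=%d state=%d\n", need_covered(&s, 1), s);
        return 0;
}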
@@ -1006,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1006 int i; 1049 int i;
1007 int iclogsize; 1050 int iclogsize;
1008 int error = ENOMEM; 1051 int error = ENOMEM;
1052 uint log2_size = 0;
1009 1053
1010 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1054 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1011 if (!log) { 1055 if (!log) {
@@ -1031,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1031 1075
1032 error = EFSCORRUPTED; 1076 error = EFSCORRUPTED;
1033 if (xfs_sb_version_hassector(&mp->m_sb)) { 1077 if (xfs_sb_version_hassector(&mp->m_sb)) {
1034 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1078 log2_size = mp->m_sb.sb_logsectlog;
1035 if (log->l_sectbb_log < 0 || 1079 if (log2_size < BBSHIFT) {
1036 log->l_sectbb_log > mp->m_sectbb_log) { 1080 xlog_warn("XFS: Log sector size too small "
1037 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1081 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1038 log->l_sectbb_log);
1039 goto out_free_log; 1082 goto out_free_log;
1040 } 1083 }
1041 1084
1042 /* for larger sector sizes, must have v2 or external log */ 1085 log2_size -= BBSHIFT;
1043 if (log->l_sectbb_log != 0 && 1086 if (log2_size > mp->m_sectbb_log) {
1044 (log->l_logBBstart != 0 && 1087 xlog_warn("XFS: Log sector size too large "
1045 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1088 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1046 xlog_warn("XFS: log sector size (0x%x) invalid "
1047 "for configuration.", log->l_sectbb_log);
1048 goto out_free_log; 1089 goto out_free_log;
1049 } 1090 }
1050 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1091
1051 xlog_warn("XFS: Log sector log (0x%x) too small.", 1092 /* for larger sector sizes, must have v2 or external log */
1052 mp->m_sb.sb_logsectlog); 1093 if (log2_size && log->l_logBBstart > 0 &&
1094 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1095
1096 xlog_warn("XFS: log sector size (0x%x) invalid "
1097 "for configuration.", log2_size);
1053 goto out_free_log; 1098 goto out_free_log;
1054 } 1099 }
1055 } 1100 }
1056 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1101 log->l_sectBBsize = 1 << log2_size;
1057 1102
1058 xlog_get_iclog_buffer_size(mp, log); 1103 xlog_get_iclog_buffer_size(mp, log);
1059 1104
@@ -1133,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1133 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1134 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1135 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1136 return log; 1184 return log;
1137 1185
1138out_free_iclog: 1186out_free_iclog:
@@ -1160,26 +1208,31 @@ out:
1160 * ticket. Return the lsn of the commit record. 1208 * ticket. Return the lsn of the commit record.
1161 */ 1209 */
1162STATIC int 1210STATIC int
1163xlog_commit_record(xfs_mount_t *mp, 1211xlog_commit_record(
1164 xlog_ticket_t *ticket, 1212 struct log *log,
1165 xlog_in_core_t **iclog, 1213 struct xlog_ticket *ticket,
1166 xfs_lsn_t *commitlsnp) 1214 struct xlog_in_core **iclog,
1215 xfs_lsn_t *commitlsnp)
1167{ 1216{
1168 int error; 1217 struct xfs_mount *mp = log->l_mp;
1169 xfs_log_iovec_t reg[1]; 1218 int error;
1170 1219 struct xfs_log_iovec reg = {
1171 reg[0].i_addr = NULL; 1220 .i_addr = NULL,
1172 reg[0].i_len = 0; 1221 .i_len = 0,
1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1222 .i_type = XLOG_REG_TYPE_COMMIT,
1223 };
1224 struct xfs_log_vec vec = {
1225 .lv_niovecs = 1,
1226 .lv_iovecp = &reg,
1227 };
1174 1228
1175 ASSERT_ALWAYS(iclog); 1229 ASSERT_ALWAYS(iclog);
1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1230 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1177 iclog, XLOG_COMMIT_TRANS))) { 1231 XLOG_COMMIT_TRANS);
1232 if (error)
1178 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1233 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1179 }
1180 return error; 1234 return error;
1181} /* xlog_commit_record */ 1235}
1182
1183 1236
1184/* 1237/*
1185 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1238 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1454,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1454 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1455 int i; 1508 int i;
1456 1509
1510 xlog_cil_destroy(log);
1511
1457 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1458 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1459 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1496,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1496 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1497 * the reservation 1552 * the reservation
1498 */ 1553 */
1499STATIC void 1554void
1500xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1501{ 1558{
1502 uint i; 1559 uint i;
1503 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1597,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1597 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1598 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1599 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1661}
1662
1663/*
1664 * Calculate the potential space needed by the log vector. Each region gets
1665 * its own xlog_op_header_t and may need to be double word aligned.
1666 */
1667static int
1668xlog_write_calc_vec_length(
1669 struct xlog_ticket *ticket,
1670 struct xfs_log_vec *log_vector)
1671{
1672 struct xfs_log_vec *lv;
1673 int headers = 0;
1674 int len = 0;
1675 int i;
1676
1677 /* acct for start rec of xact */
1678 if (ticket->t_flags & XLOG_TIC_INITED)
1679 headers++;
1680
1681 for (lv = log_vector; lv; lv = lv->lv_next) {
1682 headers += lv->lv_niovecs;
1683
1684 for (i = 0; i < lv->lv_niovecs; i++) {
1685 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1686
1687 len += vecp->i_len;
1688 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1689 }
1690 }
1691
1692 ticket->t_res_num_ophdrs += headers;
1693 len += headers * sizeof(struct xlog_op_header);
1694
1695 return len;
1696}
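A standalone worked version of the arithmetic in xlog_write_calc_vec_length(): one op header per region, plus one for the start record when the ticket is still INITED. The op header size is illustrative and the xfs-specific t_res_num_ophdrs accounting is dropped:

#include <stdio.h>

#define OPHDR_SIZE 12   /* assumed stand-in for sizeof(xlog_op_header) */

struct iovec_sketch { int i_len; };
struct log_vec_sketch {
        struct log_vec_sketch *lv_next;
        int                    lv_niovecs;
        struct iovec_sketch   *lv_iovecp;
};

static int calc_vec_length(struct log_vec_sketch *log_vector, int inited)
{
        int headers = inited ? 1 : 0;   /* start record of the transaction */
        int len = 0;

        for (struct log_vec_sketch *lv = log_vector; lv; lv = lv->lv_next) {
                headers += lv->lv_niovecs;      /* one op header per region */
                for (int i = 0; i < lv->lv_niovecs; i++)
                        len += lv->lv_iovecp[i].i_len;
        }
        return len + headers * OPHDR_SIZE;
}

int main(void)
{
        struct iovec_sketch regs[2] = { { 64 }, { 128 } };
        struct log_vec_sketch vec = { NULL, 2, regs };

        /* 64 + 128 payload, 2 region headers + 1 start record header:
         * 192 + 3 * 12 = 228 */
        printf("len=%d\n", calc_vec_length(&vec, 1));
        return 0;
}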
1697
1698/*
1699 * If first write for transaction, insert start record. We can't be trying to
1700 * commit if we are inited. We can't have any "partial_copy" if we are inited.
1701 */
1702static int
1703xlog_write_start_rec(
1704 struct xlog_op_header *ophdr,
1705 struct xlog_ticket *ticket)
1706{
1707 if (!(ticket->t_flags & XLOG_TIC_INITED))
1708 return 0;
1709
1710 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1711 ophdr->oh_clientid = ticket->t_clientid;
1712 ophdr->oh_len = 0;
1713 ophdr->oh_flags = XLOG_START_TRANS;
1714 ophdr->oh_res2 = 0;
1715
1716 ticket->t_flags &= ~XLOG_TIC_INITED;
1717
1718 return sizeof(struct xlog_op_header);
1719}
1720
1721static xlog_op_header_t *
1722xlog_write_setup_ophdr(
1723 struct log *log,
1724 struct xlog_op_header *ophdr,
1725 struct xlog_ticket *ticket,
1726 uint flags)
1727{
1728 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1729 ophdr->oh_clientid = ticket->t_clientid;
1730 ophdr->oh_res2 = 0;
1731
1732 /* are we copying a commit or unmount record? */
1733 ophdr->oh_flags = flags;
1734
1735 /*
1736 * We've seen logs corrupted with bad transaction client ids. This
1737 * makes sure that XFS doesn't generate them. Turn this into an EIO
1738 * and shut down the filesystem.
1739 */
1740 switch (ophdr->oh_clientid) {
1741 case XFS_TRANSACTION:
1742 case XFS_VOLUME:
1743 case XFS_LOG:
1744 break;
1745 default:
1746 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1747 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1748 ophdr->oh_clientid, ticket);
1749 return NULL;
1750 }
1751
1752 return ophdr;
1753}
1754
1755/*
1756 * Set up the parameters of the region copy into the log. This has
1757 * to handle region write split across multiple log buffers - this
1758 * state is kept external to this function so that this code can
1759 * be written in an obvious, self-documenting manner.
1760 */
1761static int
1762xlog_write_setup_copy(
1763 struct xlog_ticket *ticket,
1764 struct xlog_op_header *ophdr,
1765 int space_available,
1766 int space_required,
1767 int *copy_off,
1768 int *copy_len,
1769 int *last_was_partial_copy,
1770 int *bytes_consumed)
1771{
1772 int still_to_copy;
1773
1774 still_to_copy = space_required - *bytes_consumed;
1775 *copy_off = *bytes_consumed;
1776
1777 if (still_to_copy <= space_available) {
1778 /* write of region completes here */
1779 *copy_len = still_to_copy;
1780 ophdr->oh_len = cpu_to_be32(*copy_len);
1781 if (*last_was_partial_copy)
1782 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1783 *last_was_partial_copy = 0;
1784 *bytes_consumed = 0;
1785 return 0;
1786 }
1787
1788 /* partial write of region, needs extra log op header reservation */
1789 *copy_len = space_available;
1790 ophdr->oh_len = cpu_to_be32(*copy_len);
1791 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1792 if (*last_was_partial_copy)
1793 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1794 *bytes_consumed += *copy_len;
1795 (*last_was_partial_copy)++;
1796
1797 /* account for new log op header */
1798 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1799 ticket->t_res_num_ophdrs++;
1800
1801 return sizeof(struct xlog_op_header);
1802}
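A standalone simulation of the split-copy state kept external to xlog_write_setup_copy(): a region larger than the space available in one iclog is written in pieces, with bytes_consumed carrying progress between iterations. Buffer sizes are arbitrary and the op header bookkeeping is omitted:

#include <stdio.h>

/*
 * Decide how much of a region fits this time round: a full copy resets
 * the split state, a partial copy records progress and signals that
 * another pass (and another op header) will be needed.
 */
static int setup_copy(int space_available, int space_required,
                      int *copy_len, int *partial, int *bytes_consumed)
{
        int still_to_copy = space_required - *bytes_consumed;

        if (still_to_copy <= space_available) {
                *copy_len = still_to_copy;      /* region completes here */
                *partial = 0;
                *bytes_consumed = 0;
                return 0;
        }
        *copy_len = space_available;            /* partial write */
        *bytes_consumed += *copy_len;
        (*partial)++;
        return 1;
}

int main(void)
{
        int partial = 0, consumed = 0, copy_len;
        int region = 250;                  /* bytes in the region */
        int space[] = { 100, 100, 100 };   /* space left in successive iclogs */

        for (int i = 0; i < 3; i++) {
                int more = setup_copy(space[i], region, &copy_len,
                                      &partial, &consumed);
                printf("chunk %d: copied %d, %s\n", i, copy_len,
                       more ? "continues" : "done");
                if (!more)
                        break;
        }
        return 0;
}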
1803
1804static int
1805xlog_write_copy_finish(
1806 struct log *log,
1807 struct xlog_in_core *iclog,
1808 uint flags,
1809 int *record_cnt,
1810 int *data_cnt,
1811 int *partial_copy,
1812 int *partial_copy_len,
1813 int log_offset,
1814 struct xlog_in_core **commit_iclog)
1815{
1816 if (*partial_copy) {
1817 /*
1818 * This iclog has already been marked WANT_SYNC by
1819 * xlog_state_get_iclog_space.
1820 */
1821 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1822 *record_cnt = 0;
1823 *data_cnt = 0;
1824 return xlog_state_release_iclog(log, iclog);
1825 }
1826
1827 *partial_copy = 0;
1828 *partial_copy_len = 0;
1829
1830 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1831 /* no more space in this iclog - push it. */
1832 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1833 *record_cnt = 0;
1834 *data_cnt = 0;
1835
1836 spin_lock(&log->l_icloglock);
1837 xlog_state_want_sync(log, iclog);
1838 spin_unlock(&log->l_icloglock);
1839
1840 if (!commit_iclog)
1841 return xlog_state_release_iclog(log, iclog);
1842 ASSERT(flags & XLOG_COMMIT_TRANS);
1843 *commit_iclog = iclog;
1844 }
1845
1846 return 0;
1600} 1847}
1601 1848
1602/* 1849/*
@@ -1639,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1639 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1640 * bytes have been written out. 1887 * bytes have been written out.
1641 */ 1888 */
1642STATIC int 1889int
1643xlog_write( 1890xlog_write(
1644 struct xfs_mount *mp, 1891 struct log *log,
1645 struct xfs_log_iovec reg[], 1892 struct xfs_log_vec *log_vector,
1646 int nentries,
1647 struct xlog_ticket *ticket, 1893 struct xlog_ticket *ticket,
1648 xfs_lsn_t *start_lsn, 1894 xfs_lsn_t *start_lsn,
1649 struct xlog_in_core **commit_iclog, 1895 struct xlog_in_core **commit_iclog,
1650 uint flags) 1896 uint flags)
1651{ 1897{
1652 xlog_t *log = mp->m_log; 1898 struct xlog_in_core *iclog = NULL;
1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1899 struct xfs_log_iovec *vecp;
1654 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1900 struct xfs_log_vec *lv;
1655 __psint_t ptr; /* copy address into data region */ 1901 int len;
1656 int len; /* # xlog_write() bytes 2 still copy */ 1902 int index;
1657 int index; /* region index currently copying */ 1903 int partial_copy = 0;
1658 int log_offset; /* offset (from 0) into data region */ 1904 int partial_copy_len = 0;
1659 int start_rec_copy; /* # bytes to copy for start record */ 1905 int contwr = 0;
1660 int partial_copy; /* did we split a region? */ 1906 int record_cnt = 0;
1661 int partial_copy_len;/* # bytes copied if split region */ 1907 int data_cnt = 0;
1662 int need_copy; /* # bytes need to memcpy this region */ 1908 int error;
1663 int copy_len; /* # bytes actually memcpy'ing */
1664 int copy_off; /* # bytes from entry start */
1665 int contwr; /* continued write of in-core log? */
1666 int error;
1667 int record_cnt = 0, data_cnt = 0;
1668
1669 partial_copy_len = partial_copy = 0;
1670
1671 /* Calculate potential maximum space. Each region gets its own
1672 * xlog_op_header_t and may need to be double word aligned.
1673 */
1674 len = 0;
1675 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1676 len += sizeof(xlog_op_header_t);
1677 ticket->t_res_num_ophdrs++;
1678 }
1679
1680 for (index = 0; index < nentries; index++) {
1681 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1682 ticket->t_res_num_ophdrs++;
1683 len += reg[index].i_len;
1684 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1685 }
1686 contwr = *start_lsn = 0;
1687 1909
1688 if (ticket->t_curr_res < len) { 1910 *start_lsn = 0;
1689 xlog_print_tic_res(mp, ticket);
1690#ifdef DEBUG
1691 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else
1694 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */
1698 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1699#endif
1700 } else
1701 ticket->t_curr_res -= len;
1702 1911
1703 for (index = 0; index < nentries; ) { 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1913 if (log->l_cilp) {
1705 &contwr, &log_offset))) 1914 /*
1706 return error; 1915 * Region headers and bytes are already accounted for.
1916 * We only need to take into account start records and
1917 * split regions in this function.
1918 */
1919 if (ticket->t_flags & XLOG_TIC_INITED)
1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1707 1921
1708 ASSERT(log_offset <= iclog->ic_size - 1); 1922 /*
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1923 * Commit record headers need to be accounted for. These
1924 * come in as separate writes so are easy to detect.
1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1930
1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1933
1934 index = 0;
1935 lv = log_vector;
1936 vecp = lv->lv_iovecp;
1937 while (lv && index < lv->lv_niovecs) {
1938 void *ptr;
1939 int log_offset;
1940
1941 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1942 &contwr, &log_offset);
1943 if (error)
1944 return error;
1710 1945
1711 /* start_lsn is the first lsn written to. That's all we need. */ 1946 ASSERT(log_offset <= iclog->ic_size - 1);
1712 if (! *start_lsn) 1947 ptr = iclog->ic_datap + log_offset;
1713 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1714 1948
1715 /* This loop writes out as many regions as can fit in the amount 1949 /* start_lsn is the first lsn written to. That's all we need. */
1716 * of space which was allocated by xlog_state_get_iclog_space(). 1950 if (!*start_lsn)
1717 */ 1951 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740 1952
1741 /* Copy log operation header directly into data section */ 1953 /*
1742 logop_head = (xlog_op_header_t *)ptr; 1954 * This loop writes out as many regions as can fit in the amount
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1955 * of space which was allocated by xlog_state_get_iclog_space().
1744 logop_head->oh_clientid = ticket->t_clientid; 1956 */
1745 logop_head->oh_res2 = 0; 1957 while (lv && index < lv->lv_niovecs) {
1958 struct xfs_log_iovec *reg = &vecp[index];
1959 struct xlog_op_header *ophdr;
1960 int start_rec_copy;
1961 int copy_len;
1962 int copy_off;
1963
1964 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1965 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1966
1967 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1968 if (start_rec_copy) {
1969 record_cnt++;
1970 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1971 start_rec_copy);
1972 }
1746 1973
1747 /* header copied directly */ 1974 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1975 if (!ophdr)
1976 return XFS_ERROR(EIO);
1749 1977
1750 /* are we copying a commit or unmount record? */ 1978 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1751 logop_head->oh_flags = flags; 1979 sizeof(struct xlog_op_header));
1980
1981 len += xlog_write_setup_copy(ticket, ophdr,
1982 iclog->ic_size-log_offset,
1983 reg->i_len,
1984 &copy_off, &copy_len,
1985 &partial_copy,
1986 &partial_copy_len);
1987 xlog_verify_dest_ptr(log, ptr);
1988
1989 /* copy region */
1990 ASSERT(copy_len >= 0);
1991 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1992 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1993
1994 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1995 record_cnt++;
1996 data_cnt += contwr ? copy_len : 0;
1997
1998 error = xlog_write_copy_finish(log, iclog, flags,
1999 &record_cnt, &data_cnt,
2000 &partial_copy,
2001 &partial_copy_len,
2002 log_offset,
2003 commit_iclog);
2004 if (error)
2005 return error;
1752 2006
1753 /* 2007 /*
1754 * We've seen logs corrupted with bad transaction client 2008 * if we had a partial copy, we need to get more iclog
1755 * ids. This makes sure that XFS doesn't generate them on. 2009 * space but we don't want to increment the region
1756 * Turn this into an EIO and shut down the filesystem. 2010 * index because there is still more in this region to
1757 */ 2011 * write.
1758 switch (logop_head->oh_clientid) { 2012 *
1759 case XFS_TRANSACTION: 2013 * If we completed writing this region, and we flushed
1760 case XFS_VOLUME: 2014 * the iclog (indicated by resetting of the record
1761 case XFS_LOG: 2015 * count), then we also need to get more log space. If
1762 break; 2016 * this was the last record, though, we are done and
1763 default: 2017 * can just return.
1764 xfs_fs_cmn_err(CE_WARN, mp, 2018 */
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 2019 if (partial_copy)
1766 logop_head->oh_clientid, ticket); 2020 break;
1767 return XFS_ERROR(EIO);
1768 }
1769 2021
1770 /* Partial write last time? => (partial_copy != 0) 2022 if (++index == lv->lv_niovecs) {
1771 * need_copy is the amount we'd like to copy if everything could 2023 lv = lv->lv_next;
1772 * fit in the current memcpy. 2024 index = 0;
1773 */ 2025 if (lv)
1774 need_copy = reg[index].i_len - partial_copy_len; 2026 vecp = lv->lv_iovecp;
1775 2027 }
1776 copy_off = partial_copy_len; 2028 if (record_cnt == 0) {
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2029 if (!lv)
1778 copy_len = need_copy; 2030 return 0;
1779 logop_head->oh_len = cpu_to_be32(copy_len); 2031 break;
1780 if (partial_copy) 2032 }
1781 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1782 partial_copy_len = partial_copy = 0;
1783 } else { /* partial write */
1784 copy_len = iclog->ic_size - log_offset;
1785 logop_head->oh_len = cpu_to_be32(copy_len);
1786 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1787 if (partial_copy)
1788 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1789 partial_copy_len += copy_len;
1790 partial_copy++;
1791 len += sizeof(xlog_op_header_t); /* from splitting of region */
1792 /* account for new log op header */
1793 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1794 ticket->t_res_num_ophdrs++;
1795 }
1796 xlog_verify_dest_ptr(log, ptr);
1797
1798 /* copy region */
1799 ASSERT(copy_len >= 0);
1800 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1801 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1802
1803 /* make copy_len total bytes copied, including headers */
1804 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1805 record_cnt++;
1806 data_cnt += contwr ? copy_len : 0;
1807 if (partial_copy) { /* copied partial region */
1808 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1809 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1810 record_cnt = data_cnt = 0;
1811 if ((error = xlog_state_release_iclog(log, iclog)))
1812 return error;
1813 break; /* don't increment index */
1814 } else { /* copied entire region */
1815 index++;
1816 partial_copy_len = partial_copy = 0;
1817
1818 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1819 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1820 record_cnt = data_cnt = 0;
1821 spin_lock(&log->l_icloglock);
1822 xlog_state_want_sync(log, iclog);
1823 spin_unlock(&log->l_icloglock);
1824 if (commit_iclog) {
1825 ASSERT(flags & XLOG_COMMIT_TRANS);
1826 *commit_iclog = iclog;
1827 } else if ((error = xlog_state_release_iclog(log, iclog)))
1828 return error;
1829 if (index == nentries)
1830 return 0; /* we are done */
1831 else
1832 break;
1833 } 2033 }
1834 } /* if (partial_copy) */ 2034 }
1835 } /* while (index < nentries) */ 2035
1836 } /* for (index = 0; index < nentries; ) */ 2036 ASSERT(len == 0);
1837 ASSERT(len == 0); 2037
2038 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2039 if (!commit_iclog)
2040 return xlog_state_release_iclog(log, iclog);
1838 2041
1839 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1840 if (commit_iclog) {
1841 ASSERT(flags & XLOG_COMMIT_TRANS); 2042 ASSERT(flags & XLOG_COMMIT_TRANS);
1842 *commit_iclog = iclog; 2043 *commit_iclog = iclog;
1843 return 0; 2044 return 0;
1844 } 2045}
1845 return xlog_state_release_iclog(log, iclog);
1846} /* xlog_write */
1847 2046
1848 2047
1849/***************************************************************************** 2048/*****************************************************************************
@@ -2826,6 +3025,8 @@ _xfs_log_force(
2826 3025
2827 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
2828 3027
3028 xlog_cil_push(log, 1);
3029
2829 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
2830 3031
2831 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -2975,6 +3176,12 @@ _xfs_log_force_lsn(
2975 3176
2976 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
2977 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
2978try_again: 3185try_again:
2979 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
2980 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3139,20 +3346,30 @@ xfs_log_ticket_get(
3139 return ticket; 3346 return ticket;
3140} 3347}
3141 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3142/* 3356/*
3143 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3144 */ 3358 */
3145STATIC xlog_ticket_t * 3359xlog_ticket_t *
3146xlog_ticket_alloc(xlog_t *log, 3360xlog_ticket_alloc(
3147 int unit_bytes, 3361 struct log *log,
3148 int cnt, 3362 int unit_bytes,
3149 char client, 3363 int cnt,
3150 uint xflags) 3364 char client,
3365 uint xflags,
3366 int alloc_flags)
3151{ 3367{
3152 xlog_ticket_t *tic; 3368 struct xlog_ticket *tic;
3153 uint num_headers; 3369 uint num_headers;
3370 int iclog_space;
3154 3371
3155 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3156 if (!tic) 3373 if (!tic)
3157 return NULL; 3374 return NULL;
3158 3375
@@ -3194,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log,
3194 /* for start-rec */ 3411 /* for start-rec */
3195 unit_bytes += sizeof(xlog_op_header_t); 3412 unit_bytes += sizeof(xlog_op_header_t);
3196 3413
3197 /* for LR headers */ 3414 /*
3198 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3415 * for LR headers - the space for data in an iclog is the size minus
3416 * the space used for the headers. If we use the iclog size, then we
3417 * undercalculate the number of headers required.
3418 *
3419 * Furthermore - the addition of op headers for split-recs might
3420 * increase the space required enough to require more log and op
3421 * headers, so take that into account too.
3422 *
3423 * IMPORTANT: This reservation makes the assumption that if this
3424 * transaction is the first in an iclog and hence has the LR headers
3425 * accounted to it, then the remaining space in the iclog is
3426 * exclusively for this transaction. i.e. if the transaction is larger
3427 * than the iclog, it will be the only thing in that iclog.
3428 * Fundamentally, this means we must pass the entire log vector to
3429 * xlog_write to guarantee this.
3430 */
3431 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3432 num_headers = howmany(unit_bytes, iclog_space);
3433
3434 /* for split-recs - ophdrs added when data split over LRs */
3435 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3436
3437 /* add extra header reservations if we overrun */
3438 while (!num_headers ||
3439 howmany(unit_bytes, iclog_space) > num_headers) {
3440 unit_bytes += sizeof(xlog_op_header_t);
3441 num_headers++;
3442 }
3199 unit_bytes += log->l_iclog_hsize * num_headers; 3443 unit_bytes += log->l_iclog_hsize * num_headers;
3200 3444
3201 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3445 /* for commit-rec LR header - note: padding will subsume the ophdr */
3202 unit_bytes += log->l_iclog_hsize; 3446 unit_bytes += log->l_iclog_hsize;
3203 3447
3204 /* for split-recs - ophdrs added when data split over LRs */
3205 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3206
3207 /* for roundoff padding for transaction data and one for commit record */ 3448 /* for roundoff padding for transaction data and one for commit record */
3208 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3449 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3209 log->l_mp->m_sb.sb_logsunit > 1) { 3450 log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3219,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log,
3219 tic->t_curr_res = unit_bytes; 3460 tic->t_curr_res = unit_bytes;
3220 tic->t_cnt = cnt; 3461 tic->t_cnt = cnt;
3221 tic->t_ocnt = cnt; 3462 tic->t_ocnt = cnt;
3222 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3463 tic->t_tid = random32();
3223 tic->t_clientid = client; 3464 tic->t_clientid = client;
3224 tic->t_flags = XLOG_TIC_INITED; 3465 tic->t_flags = XLOG_TIC_INITED;
3225 tic->t_trans_type = 0; 3466 tic->t_trans_type = 0;
3226 if (xflags & XFS_LOG_PERM_RESERV) 3467 if (xflags & XFS_LOG_PERM_RESERV)
3227 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3468 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3228 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3469 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3229 3470
3230 xlog_tic_reset_res(tic); 3471 xlog_tic_reset_res(tic);
3231 3472
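To make the header arithmetic above concrete, here is a worked model with invented sizes (32k iclogs with a 512 byte header, 12 byte op headers):

	int iclog_space = 32768 - 512;	/* 32256 usable bytes per iclog */
	int unit_bytes = 70000;		/* transaction reservation */
	int num_headers = howmany(unit_bytes, iclog_space);	/* == 3 */

	/* ophdrs for split regions could push us over another boundary */
	unit_bytes += 12 * num_headers;	/* 70036: still fits in 3 iclogs */
	while (!num_headers ||
	       howmany(unit_bytes, iclog_space) > num_headers) {
		unit_bytes += 12;	/* not taken in this example */
		num_headers++;
	}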
@@ -3246,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log,
3246 * part of the log in case we trash the log structure. 3487 * part of the log in case we trash the log structure.
3247 */ 3488 */
3248void 3489void
3249xlog_verify_dest_ptr(xlog_t *log, 3490xlog_verify_dest_ptr(
3250 __psint_t ptr) 3491 struct log *log,
3492 char *ptr)
3251{ 3493{
3252 int i; 3494 int i;
3253 int good_ptr = 0; 3495 int good_ptr = 0;
3254 3496
3255 for (i=0; i < log->l_iclog_bufs; i++) { 3497 for (i = 0; i < log->l_iclog_bufs; i++) {
3256 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3498 if (ptr >= log->l_iclog_bak[i] &&
3257 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3499 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3258 good_ptr++; 3500 good_ptr++;
3259 } 3501 }
3260 if (! good_ptr) 3502
3503 if (!good_ptr)
3261 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3504 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3262} /* xlog_verify_dest_ptr */ 3505}
3263 3506
3264STATIC void 3507STATIC void
3265xlog_verify_grant_head(xlog_t *log, int equals) 3508xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3445,6 +3688,11 @@ xlog_state_ioerror(
3445 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3446 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3447 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3448 */ 3696 */
3449int 3697int
3450xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3478,6 +3726,16 @@ xfs_log_force_umount(
3478 return 1; 3726 return 1;
3479 } 3727 }
3480 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3481 /* 3739 /*
3482 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3483 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -110,6 +109,15 @@ typedef struct xfs_log_iovec {
110 uint i_type; /* type of region */ 109 uint i_type; /* type of region */
111} xfs_log_iovec_t; 110} xfs_log_iovec_t;
112 111
112struct xfs_log_vec {
113 struct xfs_log_vec *lv_next; /* next lv in build list */
114 int lv_niovecs; /* number of iovecs in lv */
115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
119};
120
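A hedged sketch of how a log item might build one of these vectors with two regions; the tacked-on iovec array layout and the elided error handling are assumptions, not the kernel's code:

	struct xfs_log_vec *
	build_lv(struct xfs_log_item *item, void *hdr, int hdr_len,
		 void *data, int data_len)
	{
		struct xfs_log_vec *lv;

		lv = kmem_zalloc(sizeof(*lv) +
				 2 * sizeof(struct xfs_log_iovec), KM_SLEEP);
		lv->lv_item = item;
		lv->lv_niovecs = 2;
		lv->lv_iovecp = (struct xfs_log_iovec *)(lv + 1);
		lv->lv_iovecp[0].i_addr = (xfs_caddr_t)hdr;
		lv->lv_iovecp[0].i_len = hdr_len;
		lv->lv_iovecp[1].i_addr = (xfs_caddr_t)data;
		lv->lv_iovecp[1].i_len = data_len;
		return lv;		/* callers chain these via lv_next */
	}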
113/* 121/*
114 * Structure used to pass callback function and the function's argument 122 * Structure used to pass callback function and the function's argument
115 * to the log manager. 123 * to the log manager.
@@ -126,6 +134,14 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 134struct xfs_mount;
127struct xlog_in_core; 135struct xlog_in_core;
128struct xlog_ticket; 136struct xlog_ticket;
137struct xfs_log_item;
138struct xfs_item_ops;
139struct xfs_trans;
140
141void xfs_log_item_init(struct xfs_mount *mp,
142 struct xfs_log_item *item,
143 int type,
144 struct xfs_item_ops *ops);
129 145
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 146xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 147 struct xlog_ticket *ticket,
@@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
174 190
175void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
176 192
177struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
178void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
179 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
180#endif 203#endif
181 204
182 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we added to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len;
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
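A numeric walk-through of the header top-up above, with invented sizes; crossing an iclog boundary costs one log record header plus one split-region ophdr:

	int iclog_space = 32768 - 512;	/* 32256 usable bytes */
	int space_used = 31000;		/* already in this checkpoint */
	int len = 4000;			/* delta for the new item */

	/* 31000/32256 != 35000/32256, so we cross an iclog boundary */
	int hdrs = (len + iclog_space - 1) / iclog_space;	/* == 1 */
	hdrs *= 512 + 12;	/* LR header + split-region ophdr = 524 bytes */
	/* ctx ticket grows by 524; the committing ticket shrinks by 524 */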
245/*
246 * Format log items into flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffers are then attached to the log items and inserted into the
257 * Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundaries without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is rewrite the vector array to point
267 * to the copied region inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate its length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
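A rough sketch of how a delayed-logging commit path might drive this function; build_log_vectors() is an assumed helper that formats each dirty log item into an xfs_log_vec chain, not a real kernel function:

	static int
	commit_delayed(struct xfs_mount *mp, struct xfs_trans *tp, int flags)
	{
		struct xfs_log_vec *lv_chain = build_log_vectors(tp);
		xfs_lsn_t commit_lsn;

		return xfs_log_commit_cil(mp, tp, lv_chain, &commit_lsn, flags);
	}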
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
432 * then it is a background flush and so we can choose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing list so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for our own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the current commit lsn to allow the callers to determine if an
638 * iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp;
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as to
381 * be passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425};
426
427/*
428 * The amount of log space we allow the CIL to aggregate is difficult to size.
429 * Whatever we choose, we have to make sure we can get a reservation for the log
430 * space effectively, that it is large enough to capture sufficient relogging to
431 * reduce log buffer IO significantly, but not so large that it overruns the log
432 * or induces too much latency when writing out through the iclogs. We track both
433 * space consumed and the number of vectors in the checkpoint context, so we
434 * need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the num of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
469#define XLOG_CIL_SPACE_LIMIT(log) \
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471
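For concreteness, with invented log sizes:

	16 MB log:  min(16 MB >> 2, 8 MB) = min(4 MB, 8 MB)  = 4 MB
	64 MB log:  min(64 MB >> 2, 8 MB) = min(16 MB, 8 MB) = 8 MB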
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -396,9 +488,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 488 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 489 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 490 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 491 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 492 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 493 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 494 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +530,40 @@ typedef struct log {
440 530
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 532
443
444/* common routines */ 533/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 538
450extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
544
545static inline void
546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
547{
548 *ptr += bytes;
549 *len -= bytes;
550 *off += bytes;
551}
552
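A small usage sketch for the helper above; the local variables are invented, but this is the pattern xlog_write() follows when copying an ophdr and advancing through the iclog data region:

	void *ptr = iclog_dataptr;	/* assumed: current write position */
	int len = write_len;		/* assumed: bytes left to write */
	int log_offset = 0;

	memcpy(ptr, ophdr, sizeof(struct xlog_op_header));
	xlog_write_adv_cnt(&ptr, &len, &log_offset,
			   sizeof(struct xlog_op_header));
	/* ptr advanced, len reduced, log_offset grown by the ophdr size */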
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
451 567
452/* 568/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify the given count of basic blocks is valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
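A quick worked example of the rounding above, assuming a log sector of 8 basic blocks (l_sectBBsize == 8) and a 5 block request:

	nbblks = 5;
	nbblks += 8;			/* nbblks > 1: guard for misalignment, now 13 */
	nbblks = round_up(nbblks, 8);	/* 16 basic blocks allocated */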
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
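A hedged walk-through of the search invariant with invented cycle numbers: first_blk always has a cycle other than the target, end_blk always has the target cycle, so the loop converges on the first block of the target cycle:

	/* cycles on disk: [9, 9, 9, 8, 8], target cycle = 8 */
	/* first_blk=0, end_blk=4 -> mid=2 (cycle 9) -> first_blk=2 */
	/* first_blk=2, end_blk=4 -> mid=3 (cycle 8) -> end_blk=3   */
	/* mid recomputes to 2 == first_blk, loop ends: *last_blk = 3 */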
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
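The loop above is a compact fallback-allocation pattern: start from a generous power-of-two buffer and halve it on each failure, giving up once the buffer can no longer hold a single log sector. A minimal user-space sketch of the same idea, assuming 512-byte basic blocks and using malloc() as a stand-in for xlog_get_bp():

    #include <stdlib.h>
    #include <strings.h>    /* ffs() */

    /*
     * Try to allocate a buffer of up to "blocks" basic blocks
     * (512 bytes each).  Halve the request on failure; fail
     * once even one log sector (sectbb blocks) no longer fits,
     * which is where the kernel code returns ENOMEM.
     */
    static void *get_log_buffer(int blocks, int sectbb, int *out_blocks)
    {
            int bufblks = 1 << ffs(blocks);
            void *bp;

            while (!(bp = malloc((size_t)bufblks * 512))) {
                    bufblks >>= 1;
                    if (bufblks < sectbb)
                            return NULL;
            }
            *out_blocks = bufblks;
            return bp;
    }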
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
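Both alignment sites now use the generic round_down() helper, which (like the old XLOG_SECTOR_ROUNDDOWN_BLKNO macro) assumes a power-of-two sector size. A minimal equivalent and a worked value, as a sketch:

    /* power-of-two round down: clear the low bits of x */
    #define round_down(x, y)        ((x) & ~((y) - 1))

    /*
     * With an assumed 8-block sector: round_down(1037, 8) == 1032,
     * so the read-modify-write only has to preserve the
     * 1032..1036 prefix of that sector.
     */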
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans(
1534 1576
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
1646 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1701 * if this is the last reference.
1652 * 1702 *
@@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1673 */ 1723 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1725 return 0;
1676 } 1726 }
1677 1727
@@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1685 */ 1735 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1737 return 0;
1688 } 1738 }
1689 1739
@@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1703 * last reference. 1753 * last reference.
1704 */ 1754 */
1705 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1724 */ 1774 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1776 return 0;
1727} 1777}
1728 1778
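Stripped of the XFS types, the cancel-table protocol is: pass 1 inserts one refcounted node per cancelled (blkno, len) pair, and pass 2 looks a pair up, dropping a reference (and unlinking on zero) only when the query comes from the cancel record itself. A minimal single-bucket sketch, with hypothetical names:

    #include <stdlib.h>

    struct cancel_rec {
            long long               blkno;
            int                     len;
            int                     refcount;
            struct cancel_rec       *next;
    };

    /* pass 1: bump an existing record or prepend a new one */
    static void cancel_add(struct cancel_rec **bucket, long long blkno, int len)
    {
            struct cancel_rec *p;

            for (p = *bucket; p; p = p->next) {
                    if (p->blkno == blkno && p->len == len) {
                            p->refcount++;
                            return;
                    }
            }
            p = malloc(sizeof(*p));
            if (!p)
                    return;
            p->blkno = blkno;
            p->len = len;
            p->refcount = 1;
            p->next = *bucket;
            *bucket = p;
    }

    /*
     * pass 2: return 1 if the buffer is cancelled; when the query is
     * the cancel item itself, drop a reference and unlink on zero.
     */
    static int cancel_lookup(struct cancel_rec **bucket, long long blkno,
                             int len, int is_cancel_item)
    {
            struct cancel_rec **pp, *p;

            for (pp = bucket; (p = *pp) != NULL; pp = &p->next) {
                    if (p->blkno != blkno || p->len != len)
                            continue;
                    if (is_cancel_item && --p->refcount == 0) {
                            *pp = p->next;
                            free(p);
                    }
                    return 1;
            }
            return 0;
    }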
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1875 bit);
1824 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1879 item_index++;
1828 } 1880 }
1829 1881
@@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1837 } 1889 }
1838 1890
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1894
1843 /* 1895 /*
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1961
1907 /* 1962 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1966 */
1912 error = 0; 1967 error = 0;
1913 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1932 } 1987 }
1933 1988
1934 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1993 next:
1939 i++; 1994 i++;
1940 bit += nbits; 1995 bit += nbits;
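The replay loop just shown walks the item's dirty bitmap one run of set bits at a time, copying one logged region per run at chunk granularity. A user-space sketch of the same walk, assuming 128-byte chunks (mirroring XFS_BLF_SHIFT == 7) and using stand-ins for xfs_next_bit()/xfs_contig_bits():

    #include <stdint.h>
    #include <string.h>

    #define CHUNK_SHIFT     7       /* 128-byte chunks, as in XFS_BLF_SHIFT */

    /* index of the next set bit at or after start, or -1 */
    static int next_bit(const uint32_t *map, int size, int start)
    {
            for (int i = start; i < size; i++)
                    if (map[i / 32] & (1u << (i % 32)))
                            return i;
            return -1;
    }

    /* length of the run of set bits beginning at start */
    static int contig_bits(const uint32_t *map, int size, int start)
    {
            int n = 0;

            while (start + n < size &&
                   (map[(start + n) / 32] & (1u << ((start + n) % 32))))
                    n++;
            return n;
    }

    /* copy one logged region per run of set bits into the buffer */
    static void replay_regions(char *buf, char *const regions[],
                               const uint32_t *map, int map_bits)
    {
            int i = 0, bit = 0, nbits;

            while ((bit = next_bit(map, map_bits, bit)) >= 0) {
                    nbits = contig_bits(map, map_bits, bit);
                    memcpy(buf + ((size_t)bit << CHUNK_SHIFT),
                           regions[i++], (size_t)nbits << CHUNK_SHIFT);
                    bit += nbits;
            }
    }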
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2148 }
2092 2149
2093 type = 0; 2150 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2100 /* 2157 /*
2101 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2117 * 2174 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2150 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2208 /*
2152 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2154 */ 2211 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2213 return 0;
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2185 2244
2186 mp = log->l_mp; 2245 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2190 2249
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans(
2198 } 2257 }
2199 2258
2200 error = 0; 2259 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2262 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
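The rename on line 31 does not change the value of the constant: assuming the usual tree values of XFS_MAX_BLOCKSIZE = 65536 and XFS_BLF_CHUNK = 128 (formerly XFS_BLI_CHUNK), XLOG_MAX_REGIONS_IN_ITEM = 65536 / 128 / 2 + 1 = 257.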
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7b983e..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 259 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 260 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 261 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */
262} xfs_mount_t; 263} xfs_mount_t;
263 264
264/* 265/*
@@ -267,6 +268,7 @@ typedef struct xfs_mount {
267#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
268 must be synchronous except 269 must be synchronous except
269 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
270#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
271#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
272#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,24 +44,14 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 48
57kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
58 50
59
60/* 51/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 52 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 53 * due to register overflow from temporaries in the calculations.
63 */ 54 */
64
65STATIC uint 55STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 56xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 57{
@@ -254,13 +244,30 @@ _xfs_trans_alloc(
254 tp->t_type = type; 244 tp->t_type = type;
255 tp->t_mountp = mp; 245 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
258 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
260 return tp; 249 return tp;
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 struct xfs_trans *tp)
259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
265 atomic_dec(&tp->t_mountp->m_active_trans);
266 xfs_trans_free_dqinfo(tp);
267 kmem_zone_free(xfs_trans_zone, tp);
268}
269
270/*
264 * This is called to create a new transaction which will share the 271 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 272 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 273 * unused block and rt extent reservations are also inherited. This
@@ -283,9 +290,8 @@ xfs_trans_dup(
283 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
287 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
289 295
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +427,6 @@ undo_blocks:
421 return error; 427 return error;
422} 428}
423 429
424
425/* 430/*
426 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
652 */ 657 */
653STATIC void 658void
654xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 660 xfs_trans_t *tp)
656{ 661{
@@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb(
764 } 769 }
765} 770}
766 771
772/*
773 * Total up the number of log iovecs needed to commit this
774 * transaction. The transaction itself needs one for the
775 * transaction header. Ask each dirty item in turn how many
776 * it needs to get the total.
777 */
778static uint
779xfs_trans_count_vecs(
780 struct xfs_trans *tp)
781{
782 int nvecs;
783 xfs_log_item_desc_t *lidp;
784
785 nvecs = 1;
786 lidp = xfs_trans_first_item(tp);
787 ASSERT(lidp != NULL);
788
789 /* In the non-debug case we need to start bailing out if we
790 * didn't find a log_item here, return zero and let trans_commit
791 * deal with it.
792 */
793 if (lidp == NULL)
794 return 0;
795
796 while (lidp != NULL) {
797 /*
798 * Skip items which aren't dirty in this transaction.
799 */
800 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
801 lidp = xfs_trans_next_item(tp, lidp);
802 continue;
803 }
804 lidp->lid_size = IOP_SIZE(lidp->lid_item);
805 nvecs += lidp->lid_size;
806 lidp = xfs_trans_next_item(tp, lidp);
807 }
808
809 return nvecs;
810}
767 811
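As the comment on the relocated helper says, the count is one iovec for the transaction header plus whatever each dirty item reports via IOP_SIZE(); for a hypothetical transaction with two dirty items reporting three and two regions, nvecs = 1 + 3 + 2 = 6.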
768/* 812/*
769 * xfs_trans_commit 813 * Fill in the vector with pointers to data to be logged
814 * by this transaction. The transaction header takes
815 * the first vector, and then each dirty item takes the
816 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 817 *
771 * Commit the given transaction to the log a/synchronously. 818 * As each item fills in the entries it needs, also pin the item
819 * so that it cannot be flushed out until the log write completes.
820 */
821static void
822xfs_trans_fill_vecs(
823 struct xfs_trans *tp,
824 struct xfs_log_iovec *log_vector)
825{
826 xfs_log_item_desc_t *lidp;
827 struct xfs_log_iovec *vecp;
828 uint nitems;
829
830 /*
831 * Skip over the entry for the transaction header, we'll
832 * fill that in at the end.
833 */
834 vecp = log_vector + 1;
835
836 nitems = 0;
837 lidp = xfs_trans_first_item(tp);
838 ASSERT(lidp);
839 while (lidp) {
840 /* Skip items which aren't dirty in this transaction. */
841 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
842 lidp = xfs_trans_next_item(tp, lidp);
843 continue;
844 }
845
846 /*
847 * The item may be marked dirty but not log anything. This can
848 * be used to get called when a transaction is committed.
849 */
850 if (lidp->lid_size)
851 nitems++;
852 IOP_FORMAT(lidp->lid_item, vecp);
853 vecp += lidp->lid_size;
854 IOP_PIN(lidp->lid_item);
855 lidp = xfs_trans_next_item(tp, lidp);
856 }
857
858 /*
859 * Now that we've counted the number of items in this transaction, fill
860 * in the transaction header. Note that the transaction header does not
861 * have a log item.
862 */
863 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
864 tp->t_header.th_type = tp->t_type;
865 tp->t_header.th_num_items = nitems;
866 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
867 log_vector->i_len = sizeof(xfs_trans_header_t);
868 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
869}
870
871/*
872 * The committed item processing consists of calling the committed routine of
873 * each logged item, updating the item's position in the AIL if necessary, and
874 * unpinning each item. If the committed routine returns -1, then do nothing
875 * further with the item because it may have been freed.
772 * 876 *
773 * XFS disk error handling mechanism is not based on a typical 877 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 878 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 879 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 880 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 881 *
778 * caller may assume that all locked objects joined to the transaction 882 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 883 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 884 * trying to pull the item from the AIL as we add it.
781 */ 885 */
782 /*ARGSUSED*/ 886void
783int 887xfs_trans_item_committed(
784_xfs_trans_commit( 888 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 889 xfs_lsn_t commit_lsn,
786 uint flags, 890 int aborted)
787 int *log_flushed)
788{ 891{
789 xfs_log_iovec_t *log_vector; 892 xfs_lsn_t item_lsn;
790 int nvec; 893 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 894
802 commit_lsn = -1; 895 if (aborted)
896 lip->li_flags |= XFS_LI_ABORTED;
897 item_lsn = IOP_COMMITTED(lip, commit_lsn);
898
899 /* If the committed routine returns -1, item has been freed. */
900 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
901 return;
803 902
804 /* 903 /*
805 * Determine whether this commit is releasing a permanent 904 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 905 * the location of the item in the AIL. If it is not, then do nothing.
906 * Items can never move backwards in the AIL.
907 *
908 * While the new lsn should usually be greater, it is possible that a
909 * later transaction completing simultaneously with an earlier one
910 * using the same item could complete first with a higher lsn. This
911 * would cause the earlier transaction to fail the test below.
807 */ 912 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 913 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 914 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 915 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
916 /*
917 * This will set the item's lsn to item_lsn and update the
918 * position of the item in the AIL.
919 *
920 * xfs_trans_ail_update() drops the AIL lock.
921 */
922 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 923 } else {
812 log_flags = 0; 924 spin_unlock(&ailp->xa_lock);
813 } 925 }
814 mp = tp->t_mountp;
815 926
816 /* 927 /*
817 * If there is nothing to be logged by the transaction, 928 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 929 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 930 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 931 * pay any attention to it.
821 * the free pool.
822 */ 932 */
823shut_us_down: 933 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 934}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 935
826 xfs_trans_unreserve_and_mod_sb(tp); 936/*
937 * This is typically called by the LM when a transaction has been fully
938 * committed to disk. It needs to unpin the items which have
939 * been logged by the transaction and update their positions
940 * in the AIL if necessary.
941 *
942 * This also gets called when the transactions didn't get written out
943 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
944 */
945STATIC void
946xfs_trans_committed(
947 struct xfs_trans *tp,
948 int abortflag)
949{
950 xfs_log_item_desc_t *lidp;
951 xfs_log_item_chunk_t *licp;
952 xfs_log_item_chunk_t *next_licp;
953
954 /* Call the transaction's completion callback if there is one. */
955 if (tp->t_callback != NULL)
956 tp->t_callback(tp, tp->t_callarg);
957
958 for (lidp = xfs_trans_first_item(tp);
959 lidp != NULL;
960 lidp = xfs_trans_next_item(tp, lidp)) {
961 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
962 }
963
964 /* free the item chunks, ignoring the embedded chunk */
965 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
966 next_licp = licp->lic_next;
967 kmem_free(licp);
968 }
969
970 xfs_trans_free(tp);
971}
972
973/*
974 * Called from the trans_commit code when we notice that
975 * the filesystem is in the middle of a forced shutdown.
976 */
977STATIC void
978xfs_trans_uncommit(
979 struct xfs_trans *tp,
980 uint flags)
981{
982 xfs_log_item_desc_t *lidp;
983
984 for (lidp = xfs_trans_first_item(tp);
985 lidp != NULL;
986 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 987 /*
828 * It is indeed possible for the transaction to be 988 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 989 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 990 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 991 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 992 }
847 ASSERT(tp->t_ticket != NULL);
848 993
849 /* 994 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 995 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 996
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
853 xfs_trans_apply_sb_deltas(tp); 998 xfs_trans_free(tp);
854 xfs_trans_apply_dquot_deltas(tp); 999}
1000
1001/*
1002 * Format the transaction direct to the iclog. This isolates the physical
1003 * transaction commit operation from the logical operation and hence allows
1004 * other methods to be introduced without affecting the existing commit path.
1005 */
1006static int
1007xfs_trans_commit_iclog(
1008 struct xfs_mount *mp,
1009 struct xfs_trans *tp,
1010 xfs_lsn_t *commit_lsn,
1011 int flags)
1012{
1013 int shutdown;
1014 int error;
1015 int log_flags = 0;
1016 struct xlog_in_core *commit_iclog;
1017#define XFS_TRANS_LOGVEC_COUNT 16
1018 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1019 struct xfs_log_iovec *log_vector;
1020 uint nvec;
1021
855 1022
856 /* 1023 /*
857 * Ask each log item how many log_vector entries it will 1024 * Ask each log item how many log_vector entries it will
@@ -861,8 +1028,7 @@ shut_us_down:
861 */ 1028 */
862 nvec = xfs_trans_count_vecs(tp); 1029 nvec = xfs_trans_count_vecs(tp);
863 if (nvec == 0) { 1030 if (nvec == 0) {
864 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1031 return ENOMEM; /* triggers a shutdown! */
865 goto shut_us_down;
866 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { 1032 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
867 log_vector = log_vector_fast; 1033 log_vector = log_vector_fast;
868 } else { 1034 } else {
@@ -877,6 +1043,9 @@ shut_us_down:
877 */ 1043 */
878 xfs_trans_fill_vecs(tp, log_vector); 1044 xfs_trans_fill_vecs(tp, log_vector);
879 1045
1046 if (flags & XFS_TRANS_RELEASE_LOG_RES)
1047 log_flags = XFS_LOG_REL_PERM_RESERV;
1048
880 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); 1049 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
881 1050
882 /* 1051 /*
@@ -884,18 +1053,19 @@ shut_us_down:
884 * at any time after this call. However, all the items associated 1053 * at any time after this call. However, all the items associated
885 * with the transaction are still locked and pinned in memory. 1054 * with the transaction are still locked and pinned in memory.
886 */ 1055 */
887 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1056 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
888 1057
889 tp->t_commit_lsn = commit_lsn; 1058 tp->t_commit_lsn = *commit_lsn;
890 if (nvec > XFS_TRANS_LOGVEC_COUNT) { 1059 trace_xfs_trans_commit_lsn(tp);
1060
1061 if (nvec > XFS_TRANS_LOGVEC_COUNT)
891 kmem_free(log_vector); 1062 kmem_free(log_vector);
892 }
893 1063
894 /* 1064 /*
895 * If we got a log write error. Unpin the logitems that we 1065 * If we got a log write error. Unpin the logitems that we
896 * had pinned, clean up, free trans structure, and return error. 1066 * had pinned, clean up, free trans structure, and return error.
897 */ 1067 */
898 if (error || commit_lsn == -1) { 1068 if (error || *commit_lsn == -1) {
899 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1069 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
900 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); 1070 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
901 return XFS_ERROR(EIO); 1071 return XFS_ERROR(EIO);
@@ -909,8 +1079,6 @@ shut_us_down:
909 */ 1079 */
910 xfs_trans_unreserve_and_mod_sb(tp); 1080 xfs_trans_unreserve_and_mod_sb(tp);
911 1081
912 sync = tp->t_flags & XFS_TRANS_SYNC;
913
914 /* 1082 /*
915 * Tell the LM to call the transaction completion routine 1083 * Tell the LM to call the transaction completion routine
916 * when the log write with LSN commit_lsn completes (e.g. 1084 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1121,7 @@ shut_us_down:
953 * the commit lsn of this transaction for dependency tracking 1121 * the commit lsn of this transaction for dependency tracking
954 * purposes. 1122 * purposes.
955 */ 1123 */
956 xfs_trans_unlock_items(tp, commit_lsn); 1124 xfs_trans_unlock_items(tp, *commit_lsn);
957 1125
958 /* 1126 /*
959 * If we detected a log error earlier, finish committing 1127 * If we detected a log error earlier, finish committing
@@ -973,156 +1141,204 @@ shut_us_down:
973 * and the items are released we can finally allow the iclog to 1141 * and the items are released we can finally allow the iclog to
974 * go to disk. 1142 * go to disk.
975 */ 1143 */
976 error = xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
977
978 /*
979 * If the transaction needs to be synchronous, then force the
980 * log out now and wait for it.
981 */
982 if (sync) {
983 if (!error) {
984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_SYNC, log_flushed);
986 }
987 XFS_STATS_INC(xs_trans_sync);
988 } else {
989 XFS_STATS_INC(xs_trans_async);
990 }
991
992 return (error);
993} 1145}
994 1146
995
996/* 1147/*
997 * Total up the number of log iovecs needed to commit this 1148 * Walk the log items and allocate log vector structures for
998 * transaction. The transaction itself needs one for the 1149 * each item large enough to fit all the vectors they require.
999 * transaction header. Ask each dirty item in turn how many 1150 * Note that this format differs from the old log vector format in
1000 * it needs to get the total. 1151 * that there is no transaction header in these log vectors.
1001 */ 1152 */
1002STATIC uint 1153STATIC struct xfs_log_vec *
1003xfs_trans_count_vecs( 1154xfs_trans_alloc_log_vecs(
1004 xfs_trans_t *tp) 1155 xfs_trans_t *tp)
1005{ 1156{
1006 int nvecs;
1007 xfs_log_item_desc_t *lidp; 1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1008 1160
1009 nvecs = 1;
1010 lidp = xfs_trans_first_item(tp); 1161 lidp = xfs_trans_first_item(tp);
1011 ASSERT(lidp != NULL);
1012 1162
1013 /* In the non-debug case we need to start bailing out if we 1163 /* Bail out if we didn't find a log item. */
1014 * didn't find a log_item here, return zero and let trans_commit 1164 if (!lidp) {
1015 * deal with it. 1165 ASSERT(0);
1016 */ 1166 return NULL;
1017 if (lidp == NULL) 1167 }
1018 return 0;
1019 1168
1020 while (lidp != NULL) { 1169 while (lidp != NULL) {
1021 /* 1170 struct xfs_log_vec *new_lv;
1022 * Skip items which aren't dirty in this transaction. 1171
1023 */ 1172 /* Skip items which aren't dirty in this transaction. */
1024 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1025 lidp = xfs_trans_next_item(tp, lidp); 1174 lidp = xfs_trans_next_item(tp, lidp);
1026 continue; 1175 continue;
1027 } 1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1028 lidp->lid_size = IOP_SIZE(lidp->lid_item); 1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1029 nvecs += lidp->lid_size; 1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1030 lidp = xfs_trans_next_item(tp, lidp); 1198 lidp = xfs_trans_next_item(tp, lidp);
1031 } 1199 }
1032 1200
1033 return nvecs; 1201 return ret_lv;
1034} 1202}
1035 1203
1036/* 1204static int
1037 * Called from the trans_commit code when we notice that 1205xfs_trans_commit_cil(
1038 * the filesystem is in the middle of a forced shutdown. 1206 struct xfs_mount *mp,
1039 */ 1207 struct xfs_trans *tp,
1040STATIC void 1208 xfs_lsn_t *commit_lsn,
1041xfs_trans_uncommit( 1209 int flags)
1042 xfs_trans_t *tp,
1043 uint flags)
1044{ 1210{
1045 xfs_log_item_desc_t *lidp; 1211 struct xfs_log_vec *log_vector;
1212 int error;
1046 1213
1047 for (lidp = xfs_trans_first_item(tp); 1214 /*
1048 lidp != NULL; 1215 * Get each log item to allocate a vector structure for
1049 lidp = xfs_trans_next_item(tp, lidp)) { 1216 * the log item to pass to the log write code. The

1050 /* 1217 * CIL commit code will format the vector and save it away.
1051 * Unpin all but those that aren't dirty. 1218 */
1052 */ 1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1053 if (lidp->lid_flags & XFS_LID_DIRTY) 1220 if (!log_vector)
1054 IOP_UNPIN_REMOVE(lidp->lid_item, tp); 1221 return ENOMEM;
1055 }
1056 1222
1057 xfs_trans_unreserve_and_mod_sb(tp); 1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1058 xfs_trans_unreserve_and_mod_dquots(tp); 1224 if (error)
1225 return error;
1059 1226
1060 xfs_trans_free_items(tp, flags); 1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1061 xfs_trans_free_busy(tp); 1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1062 xfs_trans_free(tp); 1231 xfs_trans_free(tp);
1232 return 0;
1063} 1233}
1064 1234
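xfs_trans_alloc_log_vecs() above uses one allocation per item, carving the iovec array out of the bytes immediately after the vector header (the lv_iovecp = &new_lv[1] assignment). A generic user-space sketch of that layout trick, with hypothetical type names:

    #include <stdlib.h>

    struct log_iovec {
            void    *base;
            int     len;
    };

    struct log_vec {
            struct log_vec          *next;
            int                     niovecs;
            struct log_iovec        *iovecp;  /* points just past this header */
    };

    /* one zeroed allocation holds the header and its iovec array */
    static struct log_vec *alloc_log_vec(int niovecs)
    {
            struct log_vec *lv;

            lv = calloc(1, sizeof(*lv) + niovecs * sizeof(struct log_iovec));
            if (!lv)
                    return NULL;
            lv->iovecp = (struct log_iovec *)&lv[1];
            lv->niovecs = niovecs;
            return lv;
    }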
1065/* 1235/*
1066 * Fill in the vector with pointers to data to be logged 1236 * xfs_trans_commit
1067 * by this transaction. The transaction header takes
1068 * the first vector, and then each dirty item takes the
1069 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1070 * 1237 *
1071 * As each item fills in the entries it needs, also pin the item 1238 * Commit the given transaction to the log a/synchronously.
1072 * so that it cannot be flushed out until the log write completes. 1239 *
1240 * XFS disk error handling mechanism is not based on a typical
1241 * transaction abort mechanism. Logically after the filesystem
1242 * gets marked 'SHUTDOWN', we can't let any new transactions
1243 * be durable - ie. committed to disk - because some metadata might
1244 * be inconsistent. In such cases, this returns an error, and the
1245 * caller may assume that all locked objects joined to the transaction
1246 * have already been unlocked as if the commit had succeeded.
1247 * Do not reference the transaction structure after this call.
1073 */ 1248 */
1074STATIC void 1249int
1075xfs_trans_fill_vecs( 1250_xfs_trans_commit(
1076 xfs_trans_t *tp, 1251 struct xfs_trans *tp,
1077 xfs_log_iovec_t *log_vector) 1252 uint flags,
1253 int *log_flushed)
1078{ 1254{
1079 xfs_log_item_desc_t *lidp; 1255 struct xfs_mount *mp = tp->t_mountp;
1080 xfs_log_iovec_t *vecp; 1256 xfs_lsn_t commit_lsn = -1;
1081 uint nitems; 1257 int error = 0;
1258 int log_flags = 0;
1259 int sync = tp->t_flags & XFS_TRANS_SYNC;
1082 1260
1083 /* 1261 /*
1084 * Skip over the entry for the transaction header, we'll 1262 * Determine whether this commit is releasing a permanent
1085 * fill that in at the end. 1263 * log reservation or not.
1086 */ 1264 */
1087 vecp = log_vector + 1; /* pointer arithmetic */ 1265 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1266 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1267 log_flags = XFS_LOG_REL_PERM_RESERV;
1268 }
1088 1269
1089 nitems = 0; 1270 /*
1090 lidp = xfs_trans_first_item(tp); 1271 * If there is nothing to be logged by the transaction,
1091 ASSERT(lidp != NULL); 1272 * then unlock all of the items associated with the
1092 while (lidp != NULL) { 1273 * transaction and free the transaction structure.
1093 /* 1274 * Also make sure to return any reserved blocks to
1094 * Skip items which aren't dirty in this transaction. 1275 * the free pool.
1095 */ 1276 */
1096 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1277 if (!(tp->t_flags & XFS_TRANS_DIRTY))
1097 lidp = xfs_trans_next_item(tp, lidp); 1278 goto out_unreserve;
1098 continue; 1279
1099 } 1280 if (XFS_FORCED_SHUTDOWN(mp)) {
1100 /* 1281 error = XFS_ERROR(EIO);
1101 * The item may be marked dirty but not log anything. 1282 goto out_unreserve;
1102 * This can be used to get called when a transaction 1283 }
1103 * is committed. 1284
1104 */ 1285 ASSERT(tp->t_ticket != NULL);
1105 if (lidp->lid_size) { 1286
1106 nitems++; 1287 /*
1288 * If we need to update the superblock, then do it now.
1289 */
1290 if (tp->t_flags & XFS_TRANS_SB_DIRTY)
1291 xfs_trans_apply_sb_deltas(tp);
1292 xfs_trans_apply_dquot_deltas(tp);
1293
1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1299 if (error == ENOMEM) {
1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1301 error = XFS_ERROR(EIO);
1302 goto out_unreserve;
1303 }
1304
1305 /*
1306 * If the transaction needs to be synchronous, then force the
1307 * log out now and wait for it.
1308 */
1309 if (sync) {
1310 if (!error) {
1311 error = _xfs_log_force_lsn(mp, commit_lsn,
1312 XFS_LOG_SYNC, log_flushed);
1107 } 1313 }
1108 IOP_FORMAT(lidp->lid_item, vecp); 1314 XFS_STATS_INC(xs_trans_sync);
1109 vecp += lidp->lid_size; /* pointer arithmetic */ 1315 } else {
1110 IOP_PIN(lidp->lid_item); 1316 XFS_STATS_INC(xs_trans_async);
1111 lidp = xfs_trans_next_item(tp, lidp);
1112 } 1317 }
1113 1318
1319 return error;
1320
1321out_unreserve:
1322 xfs_trans_unreserve_and_mod_sb(tp);
1323
1114 /* 1324 /*
1115 * Now that we've counted the number of items in this 1325 * It is indeed possible for the transaction to be not dirty but
1116 * transaction, fill in the transaction header. 1326 * the dqinfo portion to be. All that means is that we have some
1327 * (non-persistent) quota reservations that need to be unreserved.
1117 */ 1328 */
1118 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; 1329 xfs_trans_unreserve_and_mod_dquots(tp);
1119 tp->t_header.th_type = tp->t_type; 1330 if (tp->t_ticket) {
1120 tp->t_header.th_num_items = nitems; 1331 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1332 if (commit_lsn == -1 && !error)
1122 log_vector->i_len = sizeof(xfs_trans_header_t); 1333 error = XFS_ERROR(EIO);
1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; 1334 }
1124} 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1337 xfs_trans_free(tp);
1125 1338
1339 XFS_STATS_INC(xs_trans_empty);
1340 return error;
1341}
1126 1342
1127/* 1343/*
1128 * Unlock all of the transaction's items and free the transaction. 1344 * Unlock all of the transaction's items and free the transaction.
@@ -1195,25 +1411,10 @@ xfs_trans_cancel(
1195 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1196 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1197 1413
1198 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1199 xfs_trans_free_busy(tp);
1200 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1201} 1416}
1202 1417
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
1217/* 1418/*
1218 * Roll from one trans in the sequence of PERMANENT transactions to 1419 * Roll from one trans in the sequence of PERMANENT transactions to
1219 * the next: permanent transactions are only flushed out when 1420 * the next: permanent transactions are only flushed out when
@@ -1283,174 +1484,3 @@ xfs_trans_roll(
1283 1484 	xfs_trans_ihold(trans, dp);
1284 1485 	return 0;
1285 1486 }
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transaction didn't get written out
1295 * because of an I/O error; in that case the abort flag XFS_LI_ABORTED is set.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
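The removed xfs_trans_committed() walks the chunk embedded in the transaction first, then each heap-allocated overflow chunk, freeing the overflow chunks as it goes. A self-contained sketch of that embedded-chunk-plus-overflow-list pattern, with illustrative stand-in types:

#include <stdlib.h>

/* Illustrative stand-ins for xfs_log_item_chunk_t and xfs_trans_t. */
struct chunk_sketch {
	struct chunk_sketch	*next;
	int			nused;
	int			items[15];
};

struct trans_sketch {
	struct chunk_sketch	first;	/* embedded: lives inside the trans */
};

static void free_item_chunks(struct trans_sketch *tp)
{
	struct chunk_sketch *cp = tp->first.next;

	/* Reset, but never free, the embedded chunk... */
	tp->first.next = NULL;
	tp->first.nused = 0;

	/* ...then walk and free every overflow chunk. */
	while (cp != NULL) {
		struct chunk_sketch *next = cp->next;
		free(cp);
		cp = next;
	}
}

int main(void)
{
	struct trans_sketch tp = { { 0 } };

	tp.first.next = calloc(1, sizeof(struct chunk_sketch));
	free_item_chunks(&tp);
	return 0;
}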
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags; if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
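The key invariant in the removed code above is that an item's LSN never moves backwards in the AIL, even when two completions race on the same item. A hedged sketch of that forward-only update under a lock (a real AIL would also re-sort the item):

#include <pthread.h>
#include <stdio.h>

typedef unsigned long long lsn_t;

struct item_sketch {
	lsn_t lsn;
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* An item's LSN may only move forward: when two completions race on
 * the same item, the higher LSN wins regardless of finishing order. */
static void ail_update(struct item_sketch *ip, lsn_t item_lsn)
{
	pthread_mutex_lock(&ail_lock);
	if (item_lsn > ip->lsn)
		ip->lsn = item_lsn;	/* a real AIL would re-sort here */
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	struct item_sketch it = { 0 };

	ail_update(&it, 200);		/* later transaction completes first */
	ail_update(&it, 100);		/* earlier one loses the race... */
	printf("%llu\n", it.lsn);	/* ...and the LSN stays at 200 */
	return 0;
}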
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
49 49 #define XFS_LI_DQUOT		0x123d
50 50 #define XFS_LI_QUOTAOFF	0x123e
51 51 
52#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \
54 { XFS_LI_EFD, "XFS_LI_EFD" }, \
55 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
56 { XFS_LI_INODE, "XFS_LI_INODE" }, \
57 { XFS_LI_BUF, "XFS_LI_BUF" }, \
58 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
59 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
60
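XFS_LI_TYPE_DESC is a value/name pair list meant to be pasted into a symbolic decoder (trace formatting, for example). The same pair-list shape can also seed an ordinary lookup table; a userspace sketch restricted to the two values visible above:

#include <stdio.h>

/* Same shape as XFS_LI_TYPE_DESC, using only the values shown in the hunk. */
#define LI_TYPE_DESC \
	{ 0x123d, "XFS_LI_DQUOT" }, \
	{ 0x123e, "XFS_LI_QUOTAOFF" }

static const struct {
	int		val;
	const char	*name;
} li_names[] = { LI_TYPE_DESC };

static const char *li_type_name(int type)
{
	for (size_t i = 0; i < sizeof(li_names) / sizeof(li_names[0]); i++)
		if (li_names[i].val == type)
			return li_names[i].name;
	return "unknown";
}

int main(void)
{
	printf("%s\n", li_type_name(0x123d));	/* prints XFS_LI_DQUOT */
	return 0;
}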
52 61 /*
53 62  * Transaction types.  Used to distinguish types of buffers.
54 63  */
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
97 106 #define XFS_TRANS_GROWFSRT_FREE		39
98 107 #define XFS_TRANS_SWAPEXT		40
99 108 #define XFS_TRANS_SB_COUNT		41
100 #define XFS_TRANS_TYPE_MAX		41
109 #define XFS_TRANS_CHECKPOINT		42
110 #define XFS_TRANS_TYPE_MAX		42
101 111 /* new transaction types need to be reflected in xfs_logprint(8) */
102 112 
103 113 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
139 149 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
140 150 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
141 151 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
142 153 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
143 154 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
144 155 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc {
159 170 
160 171 #define XFS_LID_DIRTY		0x1
161 172 #define XFS_LID_PINNED		0x2
162#define XFS_LID_BUF_STALE 0x8
163 173 
164 174 /*
165 175  * This structure is used to maintain a chunk list of log_item_desc
@@ -805,6 +815,7 @@ struct xfs_log_item_desc;
805 815 struct xfs_mount;
806 816 struct xfs_trans;
807 817 struct xfs_dquot_acct;
818struct xfs_busy_extent;
808 819 
809 820 typedef struct xfs_log_item {
810 821 	struct list_head	li_ail;		/* AIL pointers */
@@ -820,6 +831,11 @@ typedef struct xfs_log_item {
820 831 						/* buffer item iodone */
821 832 						/* callback func */
822 833 	struct xfs_item_ops	*li_ops;	/* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
823 839 } xfs_log_item_t;
824 840 
825 841 #define XFS_LI_IN_AIL	0x1
@@ -833,7 +849,7 @@ typedef struct xfs_item_ops {
833 849 	uint (*iop_size)(xfs_log_item_t *);
834 850 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
835 851 	void (*iop_pin)(xfs_log_item_t *);
836 	void (*iop_unpin)(xfs_log_item_t *, int);
852 	void (*iop_unpin)(xfs_log_item_t *);
837 853 	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
838 854 	uint (*iop_trylock)(xfs_log_item_t *);
839 855 	void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +862,7 @@ typedef struct xfs_item_ops {
846 862 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
847 863 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
848 864 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
849 #define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
865 #define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
850 866 #define IOP_UNPIN_REMOVE(ip,tp)	(*(ip)->li_ops->iop_unpin_remove)(ip, tp)
851 867 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
852 868 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
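The IOP_* wrappers dispatch through the per-item-type function list, and the new iop_unpin takes only the item now that the stale-buffer flag no longer rides along. A minimal sketch of the ops-vector-plus-macro technique, with hypothetical pin/unpin operations:

#include <stdio.h>

struct log_item;

struct item_ops {
	void (*iop_pin)(struct log_item *);
	void (*iop_unpin)(struct log_item *);
};

struct log_item {
	const struct item_ops	*li_ops;	/* per-type function list */
	int			pins;
};

#define IOP_PIN(ip)	((ip)->li_ops->iop_pin)(ip)
#define IOP_UNPIN(ip)	((ip)->li_ops->iop_unpin)(ip)

static void buf_pin(struct log_item *ip)   { ip->pins++; }
static void buf_unpin(struct log_item *ip) { ip->pins--; }

static const struct item_ops buf_item_ops = { buf_pin, buf_unpin };

int main(void)
{
	struct log_item bli = { &buf_item_ops, 0 };

	IOP_PIN(&bli);		/* dispatches to buf_pin() */
	IOP_UNPIN(&bli);	/* one argument, as in the new macro */
	printf("%d\n", bli.pins);
	return 0;
}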
@@ -864,34 +880,6 @@ typedef struct xfs_item_ops {
864 880 #define XFS_ITEM_PUSHBUF	3
865 881 
866 882 /*
867 * This structure is used to maintain a list of block ranges that have been
868 * freed in the transaction. The ranges are listed in the perag[] busy list
869 * between when they're freed and the transaction is committed to disk.
870 */
871
872typedef struct xfs_log_busy_slot {
873 xfs_agnumber_t lbc_ag;
874 ushort lbc_idx; /* index in perag.busy[] */
875} xfs_log_busy_slot_t;
876
877#define XFS_LBC_NUM_SLOTS 31
878typedef struct xfs_log_busy_chunk {
879 struct xfs_log_busy_chunk *lbc_next;
880 uint lbc_free; /* free slots bitmask */
881 ushort lbc_unused; /* first unused */
882 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
883} xfs_log_busy_chunk_t;
884
885#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
886#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
887
888#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
889#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
890#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
891#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
892#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
893
894 /*
895 883  * This is the type of function which can be given to xfs_trans_callback()
896 884  * to be called upon the transaction's commit to disk.
897 885  */
@@ -942,8 +930,7 @@ typedef struct xfs_trans {
942 930 	unsigned int		t_items_free;	/* log item descs free */
943 931 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
944 932 	xfs_trans_header_t	t_header;	/* header for in-log trans */
945 	unsigned int		t_busy_free;	/* busy descs free */
946 	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
933 	struct list_head	t_busy;		/* list of busy extents */
947 934 	unsigned long		t_pflags;	/* saved process flags state */
948 935 } xfs_trans_t;
949 936 
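The hunk above swaps the fixed-size busy chunk machinery for a struct list_head, the kernel's intrusive doubly-linked list: each busy extent carries its own link and is threaded onto t_busy. A userspace sketch of the technique, with a simplified list_add() and a hypothetical busy-extent payload:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(h)	{ &(h), &(h) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified list_add(): insert right after the head. */
static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* Hypothetical payload: each busy extent carries its own link. */
struct busy_extent {
	struct list_head	list;	/* threads onto trans->t_busy */
	unsigned int		agno, bno, len;
};

int main(void)
{
	struct list_head t_busy = LIST_HEAD_INIT(t_busy);
	struct busy_extent be = { .agno = 1, .bno = 100, .len = 8 };
	struct busy_extent *p;

	list_add(&be.list, &t_busy);
	p = container_of(t_busy.next, struct busy_extent, list);
	printf("ag %u bno %u len %u\n", p->agno, p->bno, p->len);
	return 0;
}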
@@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1017 1004 void		xfs_trans_cancel(xfs_trans_t *, int);
1018 1005 int		xfs_trans_ail_init(struct xfs_mount *);
1019 1006 void		xfs_trans_ail_destroy(struct xfs_mount *);
1020xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1021 xfs_agnumber_t ag,
1022 xfs_extlen_t idx);
1023 1007 
1024 1008 extern kmem_zone_t	*xfs_trans_zone;
1025 1009 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40 40 #include "xfs_rw.h"
41 41 #include "xfs_trace.h"
42 42 
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
58
59 len = BBTOB(len);
60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
61 if (xfs_lic_are_all_free(licp)) {
62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
43 85 
86 	return NULL;
87 }
44 STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
45 		xfs_daddr_t, int);
46 STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 		xfs_daddr_t, int);
48 88 
49 89 /*
50 90  * Add the locked buffer to the transaction.
@@ -74,7 +114,7 @@ _xfs_trans_bjoin(
74 114 	xfs_buf_item_init(bp, tp->t_mountp);
75 115 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 116 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
117 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
78 118 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 119 	if (reset_recur)
80 120 		bip->bli_recur = 0;
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
112 152  * within the transaction, just increment its lock recursion count
113 153  * and return a pointer to it.
114 154  *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 155  * If the transaction pointer is NULL, make this just a normal
124 156  * get_buf() call.
125 157  */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 181 	 * have it locked.  In this case we just increment the lock
150 182 	 * recursion count and return the buffer to the caller.
151 183 	 */
152 	if (tp->t_items.lic_next == NULL) {
153 		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 	} else {
155 		bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 	}
184 	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
157 185 	if (bp != NULL) {
158 186 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 187 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
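As the comment above says, re-locking a buffer the transaction already owns just bumps a recursion count rather than taking the lock again. A hedged userspace sketch of that trick (the unlocked owner check is safe here only because a transaction is single-threaded, as in XFS):

#include <pthread.h>

struct buf_sketch {
	pthread_mutex_t	lock;
	void		*owner;	/* transaction holding the lock, or NULL */
	int		recur;	/* lock recursion count */
};

static void trans_lock_buf(struct buf_sketch *bp, void *tp)
{
	if (bp->owner == tp) {		/* already ours: recurse */
		bp->recur++;
		return;
	}
	pthread_mutex_lock(&bp->lock);
	bp->owner = tp;
	bp->recur = 0;
}

static void trans_unlock_buf(struct buf_sketch *bp, void *tp)
{
	(void)tp;			/* identity implied by ownership */
	if (bp->recur > 0) {		/* undo one recursion level */
		bp->recur--;
		return;
	}
	bp->owner = NULL;
	pthread_mutex_unlock(&bp->lock);
}

int main(void)
{
	struct buf_sketch b = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	int tp;				/* stand-in transaction identity */

	trans_lock_buf(&b, &tp);
	trans_lock_buf(&b, &tp);	/* recursive: no deadlock */
	trans_unlock_buf(&b, &tp);
	trans_unlock_buf(&b, &tp);
	return 0;
}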
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
259 287  * within the transaction and already read in, just increment its
260 288  * lock recursion count and return a pointer to it.
261 289  *
262 * Use the fast path function xfs_trans_buf_item_match() or the buffer
263 * cache routine incore_match() to find the buffer
264 * if it is already owned by this transaction.
265 *
266 * If we don't already own the buffer, use read_buf() to get it.
267 * If it doesn't yet have an associated xfs_buf_log_item structure,
268 * then allocate one and add the item to this transaction.
269 *
270 290  * If the transaction pointer is NULL, make this just a normal
271 291  * read_buf() call.
272 292  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
328 348 	 * If the buffer is not yet read in, then we read it in, increment
329 349 	 * the lock recursion count, and return it to the caller.
330 350 	 */
331 	if (tp->t_items.lic_next == NULL) {
332 		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
333 	} else {
334 		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
335 	}
351 	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
336 352 	if (bp != NULL) {
337 353 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
338 354 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
495 511 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
496 512 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
497 513 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
514 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
499 515 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
500 516 
501 517 	/*
@@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
603 619 
604 620 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
605 621 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
606 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
622 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
607 623 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
608 624 	bip->bli_flags |= XFS_BLI_HOLD;
609 625 	trace_xfs_trans_bhold(bip);
@@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
625 641 
626 642 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
627 643 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
628 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
644 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
629 645 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
630 646 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
631 647 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
688 704 		bip->bli_flags &= ~XFS_BLI_STALE;
689 705 		ASSERT(XFS_BUF_ISSTALE(bp));
690 706 		XFS_BUF_UNSTALE(bp);
691 		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
707 		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
692 708 	}
693 709 
694 710 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
696 712 
697 713 	tp->t_flags |= XFS_TRANS_DIRTY;
698 714 	lidp->lid_flags |= XFS_LID_DIRTY;
699 	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
700 715 	bip->bli_flags |= XFS_BLI_LOGGED;
701 716 	xfs_buf_item_log(bip, first, last);
702 717 }
@@ -747,8 +762,8 @@ xfs_trans_binval(
747 762 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
748 763 		ASSERT(XFS_BUF_ISSTALE(bp));
749 764 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
750 		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
751 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
765 		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
752 767 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
753 768 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
754 769 		return;
@@ -759,7 +774,7 @@ xfs_trans_binval(
759 774 	 * in the buf log item.  The STALE flag will be used in
760 775 	 * xfs_buf_item_unpin() to determine if it should clean up
761 776 	 * when the last reference to the buf item is given up.
762 	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
777 	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
763 778 	 * and log the buf item.  This will be used at recovery time
764 779 	 * to determine that copies of the buffer in the log before
765 780 	 * this should not be replayed.
@@ -777,26 +792,26 @@ xfs_trans_binval(
777 792 	XFS_BUF_UNDELAYWRITE(bp);
778 793 	XFS_BUF_STALE(bp);
779 794 	bip->bli_flags |= XFS_BLI_STALE;
780 	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
781 	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
782 	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
795 	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
783 798 	memset((char *)(bip->bli_format.blf_data_map), 0,
784 799 	       (bip->bli_format.blf_map_size * sizeof(uint)));
785 	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
800 	lidp->lid_flags |= XFS_LID_DIRTY;
786 801 	tp->t_flags |= XFS_TRANS_DIRTY;
787 802 }
788 803 
789 804 /*
790  * This call is used to indicate that the buffer contains on-disk
791  * inodes which must be handled specially during recovery.  They
792  * require special handling because only the di_next_unlinked from
793  * the inodes in the buffer should be recovered.  The rest of the
794  * data in the buffer is logged via the inodes themselves.
805  * This call is used to indicate that the buffer contains on-disk inodes which
806  * must be handled specially during recovery.  They require special handling
807  * because only the di_next_unlinked from the inodes in the buffer should be
808  * recovered.  The rest of the data in the buffer is logged via the inodes
809  * themselves.
795 810  *
796  * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
797  * format structure so that we'll know what to do at recovery time.
811  * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
812  * transferred to the buffer's log format structure so that we'll know what to
813  * do at recovery time.
798 814  */
799 /* ARGSUSED */
800 815 void
801 816 xfs_trans_inode_buf(
802 817 	xfs_trans_t	*tp,
@@ -811,7 +826,7 @@ xfs_trans_inode_buf(
811 826 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
812 827 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
813 828 
814 	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
829 	bip->bli_flags |= XFS_BLI_INODE_BUF;
815 830 }
816 831 
817 832 /*
@@ -893,120 +908,12 @@ xfs_trans_dquot_buf(
893 908 	ASSERT(XFS_BUF_ISBUSY(bp));
894 909 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
895 910 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
896 	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
897 	       type == XFS_BLI_PDQUOT_BUF ||
898 	       type == XFS_BLI_GDQUOT_BUF);
911 	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 	       type == XFS_BLF_PDQUOT_BUF ||
913 	       type == XFS_BLF_GDQUOT_BUF);
899 914 
900 915 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
901 916 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
902 917 
903 918 	bip->bli_format.blf_flags |= type;
904 919 }
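xfs_trans_dquot_buf() asserts that the caller passed exactly one of the three dquot buffer types before OR-ing it into the format flags. A tiny sketch of that validate-then-set pattern; the flag values here are illustrative, not the real XFS_BLF_* definitions:

#include <assert.h>

/* Illustrative values only; the real flags live in the XFS headers. */
#define BLF_UDQUOT_BUF	0x4
#define BLF_PDQUOT_BUF	0x8
#define BLF_GDQUOT_BUF	0x10

struct buf_log_format_sketch {
	unsigned int	blf_flags;
};

static void mark_dquot_buf(struct buf_log_format_sketch *f, unsigned int type)
{
	/* Exactly one known type may be passed in... */
	assert(type == BLF_UDQUOT_BUF || type == BLF_PDQUOT_BUF ||
	       type == BLF_GDQUOT_BUF);
	/* ...and it is simply OR-ed into the format flags. */
	f->blf_flags |= type;
}

int main(void)
{
	struct buf_log_format_sketch f = { 0 };

	mark_dquot_buf(&f, BLF_GDQUOT_BUF);
	return f.blf_flags == BLF_GDQUOT_BUF ? 0 : 1;
}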
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299 299 void
300 300 xfs_trans_free_items(
301 301 	xfs_trans_t	*tp,
302 xfs_lsn_t commit_lsn,
302 303 	int		flags)
303 304 {
304 305 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 312 	 * Special case the embedded chunk so we don't free it below.
312 313 	 */
313 314 	if (!xfs_lic_are_all_free(licp)) {
314 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
315 		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 316 		xfs_lic_all_free(licp);
316 317 		licp->lic_unused = 0;
317 318 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 323 	 */
323 324 	while (licp != NULL) {
324 325 		ASSERT(!xfs_lic_are_all_free(licp));
325 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
326 		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 327 		next_licp = licp->lic_next;
327 328 		kmem_free(licp);
328 329 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439 
439 440 	return freed;
440 441 }
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to the busy descriptor used for
448 * the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
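The removed allocator above keeps a bitmask of free slots per chunk; when every slot is claimed it links a fresh chunk at the front of the list and takes slot 0 from it. A self-contained sketch of that chunked slot allocator, with simplified bookkeeping:

#include <stdlib.h>

#define NSLOTS		31
#define FREEMASK	((1U << NSLOTS) - 1)

struct busy_chunk {
	struct busy_chunk	*next;
	unsigned int		free;		/* bit set == slot free */
	int			slot[NSLOTS];
};

struct trans_sketch {
	struct busy_chunk	first;		/* embedded in the trans */
	int			nfree;		/* free slots, all chunks */
};

static void trans_init(struct trans_sketch *tp)
{
	tp->first.next = NULL;
	tp->first.free = FREEMASK;
	tp->nfree = NSLOTS;
}

static int *claim_slot(struct trans_sketch *tp)
{
	struct busy_chunk *cp;

	if (tp->nfree == 0) {
		/* No vacancies anywhere: link a new chunk at the front. */
		cp = calloc(1, sizeof(*cp));
		if (!cp)
			return NULL;
		cp->free = FREEMASK & ~1U;	/* claim slot 0 immediately */
		cp->next = tp->first.next;
		tp->first.next = cp;
		tp->nfree = NSLOTS - 1;
		return &cp->slot[0];
	}
	/* Otherwise scan the chunk list for the first free bit. */
	for (cp = &tp->first; cp != NULL; cp = cp->next) {
		for (int i = 0; i < NSLOTS; i++) {
			if (cp->free & (1U << i)) {
				cp->free &= ~(1U << i);
				tp->nfree--;
				return &cp->slot[i];
			}
		}
	}
	return NULL;		/* unreachable while nfree is accurate */
}

int main(void)
{
	struct trans_sketch tp;

	trans_init(&tp);
	for (int n = 0; n < 40; n++) {	/* 40 claims spill into a 2nd chunk */
		int *s = claim_slot(&tp);
		if (s)
			*s = n;
	}
	return 0;
}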
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35 35 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
36 36 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
37 37 					     struct xfs_log_item_desc *);
38 void	xfs_trans_free_items(struct xfs_trans *, int);
39 void	xfs_trans_unlock_items(struct xfs_trans *,
40 					xfs_lsn_t);
41 void	xfs_trans_free_busy(xfs_trans_t *tp);
42 xfs_log_busy_slot_t	*xfs_trans_add_busy(xfs_trans_t *tp,
43 					xfs_agnumber_t ag,
44 					xfs_extlen_t idx);
38 
39 void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41 				int flags);
42 
43 void	xfs_trans_item_committed(struct xfs_log_item *lip,
44 				xfs_lsn_t commit_lsn, int aborted);
45 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46 
46 47 /*
47 48  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75 
76 76 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
77 77 
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78 80 /*
79 81  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 82  * Disk based types:
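The new xlog_tid_t gives the log's transaction ID an explicit 32-bit width, in line with the comment above about types with fixed on-disk sizes. A sketch of the same fixed-width typedef convention using <stdint.h> stand-ins for the kernel's __uintNN_t types:

#include <assert.h>
#include <stdint.h>

/* On-disk formats need exact sizes regardless of the host ABI. */
typedef uint32_t xlog_tid_sketch_t;	/* transaction ID: 32 bits on disk */
typedef uint64_t xfs_lsn_sketch_t;	/* LSNs: 64 bits on disk */

int main(void)
{
	/* The whole point of the typedefs: sizes are fixed by contract. */
	assert(sizeof(xlog_tid_sketch_t) == 4);
	assert(sizeof(xfs_lsn_sketch_t) == 8);
	return 0;
}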